diff --git a/ChangeLog b/ChangeLog
index bb299f1c26d836e9cd61e47ff51840c22fa38fce..816aa79a039825559392c76525849a6583bd3cc1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,50 @@
+2015-01-11  Niels Möller  <nisse@lysator.liu.se>
+
+	Merged memxor-reorg changes, starting at 2014-10-23.
+
+2015-01-10  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/memxor.asm (memxor3): Moved to new file.
+	* arm/memxor3.asm: New file.
+
+2014-11-24  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/memxor3.asm (memxor3): New file, code moved from old
+	memxor.asm.
+	* x86_64/memxor.asm (memxor): Rewritten, no longer jumps into
+	memxor3.
+
+	* configure.ac (asm_replace_list): Added memxor.asm and
+	memxor3.asm.
+
+2014-10-23  Niels Möller  <nisse@lysator.liu.se>
+
+	* configure.ac (IF_ASM): New substituted variable.
+	* testsuite/Makefile.in (VALGRIND): Allow partial loads only when
+	build includes assembly files.
+
+	* memxor-internal.h (READ_PARTIAL): New macro.
+	* memxor.c (memxor_different_alignment): Avoid out-of-bounds
+	reads, corresponding to valgrind's --partial-loads-ok. Use
+	READ_PARTIAL.
+	* memxor3.c: Analogous changes for unaligned operations.
+
+	* configure.ac (asm_replace_list): Deleted memxor.asm, now
+	incompatible with the memxor/memxor3 split.
+
+	* memxor3.c: New file, split off from memxor.c.
+	* memxor-internal.h: New file, declarations shared by memxor.c and
+	memxor3.c.
+	* memxor.c: memxor3 functions moved out from this file.
+	* Makefile.in (nettle_SOURCES): Added memxor3.c.
+	(DISTFILES): Added memxor-internal.h.
+
+	* memxor.c (memxor_common_alignment, memxor_different_alignment)
+	(memxor): Change loop order, iterate from the end.
+	(memxor3_common_alignment): Unroll twice.
+	(word_t): On x86_64, unconditionally define as uint64_t, to get 64
+	bits also in M$ windows. Replaced all uses of SIZEOF_LONG.
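The word_t item above is the portability-sensitive part of this entry: 64-bit Windows is an LLP64 platform where unsigned long is 32 bits, so the old SIZEOF_LONG-derived word size silently halved the XOR stride there. A minimal standalone illustration of the new selection (it mirrors the memxor-internal.h hunk further down and is not itself part of the patch):

  /* Illustration only: word_t selection as in the new memxor-internal.h. */
  #include <stdint.h>
  #include <stdio.h>

  #if defined(__x86_64__) || defined(__arch64__)
  typedef uint64_t word_t;          /* 8 bytes even on LLP64 Windows */
  #else
  typedef unsigned long int word_t; /* native word elsewhere, as before */
  #endif

  int
  main (void)
  {
    printf ("sizeof(unsigned long) = %u, sizeof(word_t) = %u\n",
            (unsigned) sizeof (unsigned long), (unsigned) sizeof (word_t));
    return 0;
  }

On MinGW-w64 or MSVC for x86_64 this prints 4 and 8; on a typical LP64 Unix it prints 8 and 8.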
+ 2014-12-12 Niels Möller <nisse@lysator.liu.se> * cbc.h (CBC_ENCRYPT, CBC_DECRYPT): Make type-checking hack diff --git a/Makefile.in b/Makefile.in index 10a58b0b269fc16e0be54be0756dea58fa46e8e7..0d3c424622b1f0b413816c3c01f9fa0155da4d2b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -109,7 +109,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \ knuth-lfib.c \ md2.c md2-meta.c md4.c md4-meta.c \ md5.c md5-compress.c md5-compat.c md5-meta.c \ - memxor.c \ + memxor.c memxor3.c \ nettle-meta-aeads.c nettle-meta-armors.c \ nettle-meta-ciphers.c nettle-meta-hashes.c \ pbkdf2.c pbkdf2-hmac-sha1.c pbkdf2-hmac-sha256.c \ @@ -216,7 +216,7 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \ $(des_headers) descore.README \ aes-internal.h camellia-internal.h serpent-internal.h \ cast128_sboxes.h desinfo.h desCode.h \ - nettle-internal.h nettle-write.h \ + memxor-internal.h nettle-internal.h nettle-write.h \ gmp-glue.h ecc-internal.h \ mini-gmp.h mini-gmp.c asm.m4 \ nettle.texinfo nettle.info nettle.html nettle.pdf sha-example.c diff --git a/arm/memxor.asm b/arm/memxor.asm index fd0f633095670399d7c274c27d0ceb3c19516e53..a50e91bc4a463dc71bddd58df490bc083b959681 100644 --- a/arm/memxor.asm +++ b/arm/memxor.asm @@ -227,274 +227,3 @@ PROLOGUE(nettle_memxor) b .Lmemxor_bytes EPILOGUE(nettle_memxor) - -define(<DST>, <r0>) -define(<AP>, <r1>) -define(<BP>, <r2>) -define(<N>, <r3>) -undefine(<CNT>) -undefine(<TNC>) - -C Temporaries r4-r7 -define(<ACNT>, <r8>) -define(<ATNC>, <r10>) -define(<BCNT>, <r11>) -define(<BTNC>, <r12>) - - C memxor3(void *dst, const void *a, const void *b, size_t n) - .align 2 -PROLOGUE(nettle_memxor3) - cmp N, #0 - beq .Lmemxor3_ret - - push {r4,r5,r6,r7,r8,r10,r11} - cmp N, #7 - - add AP, N - add BP, N - add DST, N - - bcs .Lmemxor3_large - - C Simple byte loop -.Lmemxor3_bytes: - ldrb r4, [AP, #-1]! - ldrb r5, [BP, #-1]! - eor r4, r5 - strb r4, [DST, #-1]! - subs N, #1 - bne .Lmemxor3_bytes - -.Lmemxor3_done: - pop {r4,r5,r6,r7,r8,r10,r11} -.Lmemxor3_ret: - bx lr - -.Lmemxor3_align_loop: - ldrb r4, [AP, #-1]! - ldrb r5, [BP, #-1]! - eor r5, r4 - strb r5, [DST, #-1]! - sub N, #1 - -.Lmemxor3_large: - tst DST, #3 - bne .Lmemxor3_align_loop - - C We have at least 4 bytes left to do here. - sub N, #4 - ands ACNT, AP, #3 - lsl ACNT, #3 - beq .Lmemxor3_a_aligned - - ands BCNT, BP, #3 - lsl BCNT, #3 - bne .Lmemxor3_uu - - C Swap - mov r4, AP - mov AP, BP - mov BP, r4 - -.Lmemxor3_au: - C NOTE: We have the relevant shift count in ACNT, not BCNT - - C AP is aligned, BP is not - C v original SRC - C +-------+------+ - C |SRC-4 |SRC | - C +---+---+------+ - C |DST-4 | - C +-------+ - C - C With little-endian, we need to do - C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC) - rsb ATNC, ACNT, #32 - bic BP, #3 - - ldr r4, [BP] - - tst N, #4 - itet eq - moveq r5, r4 - subne N, #4 - beq .Lmemxor3_au_odd - -.Lmemxor3_au_loop: - ldr r5, [BP, #-4]! - ldr r6, [AP, #-4]! - eor r6, r6, r4, lsl ATNC - eor r6, r6, r5, lsr ACNT - str r6, [DST, #-4]! -.Lmemxor3_au_odd: - ldr r4, [BP, #-4]! - ldr r6, [AP, #-4]! - eor r6, r6, r5, lsl ATNC - eor r6, r6, r4, lsr ACNT - str r6, [DST, #-4]! - subs N, #8 - bcs .Lmemxor3_au_loop - adds N, #8 - beq .Lmemxor3_done - - C Leftover bytes in r4, low end - ldr r5, [AP, #-4] - eor r4, r5, r4, lsl ATNC - -.Lmemxor3_au_leftover: - C Store a byte at a time - ror r4, #24 - strb r4, [DST, #-1]! 
- subs N, #1 - beq .Lmemxor3_done - subs ACNT, #8 - sub AP, #1 - bne .Lmemxor3_au_leftover - b .Lmemxor3_bytes - -.Lmemxor3_a_aligned: - ands ACNT, BP, #3 - lsl ACNT, #3 - bne .Lmemxor3_au ; - - C a, b and dst all have the same alignment. - subs N, #8 - bcc .Lmemxor3_aligned_word_end - - C This loop runs at 8 cycles per iteration. It has been - C observed running at only 7 cycles, for this speed, the loop - C started at offset 0x2ac in the object file. - - C FIXME: consider software pipelining, similarly to the memxor - C loop. - -.Lmemxor3_aligned_word_loop: - ldmdb AP!, {r4,r5,r6} - ldmdb BP!, {r7,r8,r10} - subs N, #12 - eor r4, r7 - eor r5, r8 - eor r6, r10 - stmdb DST!, {r4, r5,r6} - bcs .Lmemxor3_aligned_word_loop - -.Lmemxor3_aligned_word_end: - C We have 0-11 bytes left to do, and N holds number of bytes -12. - adds N, #4 - bcc .Lmemxor3_aligned_lt_8 - C Do 8 bytes more, leftover is in N - ldmdb AP!, {r4, r5} - ldmdb BP!, {r6, r7} - eor r4, r6 - eor r5, r7 - stmdb DST!, {r4,r5} - beq .Lmemxor3_done - b .Lmemxor3_bytes - -.Lmemxor3_aligned_lt_8: - adds N, #4 - bcc .Lmemxor3_aligned_lt_4 - - ldr r4, [AP,#-4]! - ldr r5, [BP,#-4]! - eor r4, r5 - str r4, [DST,#-4]! - beq .Lmemxor3_done - b .Lmemxor3_bytes - -.Lmemxor3_aligned_lt_4: - adds N, #4 - beq .Lmemxor3_done - b .Lmemxor3_bytes - -.Lmemxor3_uu: - - cmp ACNT, BCNT - bic AP, #3 - bic BP, #3 - rsb ATNC, ACNT, #32 - - bne .Lmemxor3_uud - - C AP and BP are unaligned in the same way - - ldr r4, [AP] - ldr r6, [BP] - eor r4, r6 - - tst N, #4 - itet eq - moveq r5, r4 - subne N, #4 - beq .Lmemxor3_uu_odd - -.Lmemxor3_uu_loop: - ldr r5, [AP, #-4]! - ldr r6, [BP, #-4]! - eor r5, r6 - lsl r4, ATNC - eor r4, r4, r5, lsr ACNT - str r4, [DST, #-4]! -.Lmemxor3_uu_odd: - ldr r4, [AP, #-4]! - ldr r6, [BP, #-4]! - eor r4, r6 - lsl r5, ATNC - eor r5, r5, r4, lsr ACNT - str r5, [DST, #-4]! - subs N, #8 - bcs .Lmemxor3_uu_loop - adds N, #8 - beq .Lmemxor3_done - - C Leftover bytes in a4, low end - ror r4, ACNT -.Lmemxor3_uu_leftover: - ror r4, #24 - strb r4, [DST, #-1]! - subs N, #1 - beq .Lmemxor3_done - subs ACNT, #8 - bne .Lmemxor3_uu_leftover - b .Lmemxor3_bytes - -.Lmemxor3_uud: - C Both AP and BP unaligned, and in different ways - rsb BTNC, BCNT, #32 - - ldr r4, [AP] - ldr r6, [BP] - - tst N, #4 - ittet eq - moveq r5, r4 - moveq r7, r6 - subne N, #4 - beq .Lmemxor3_uud_odd - -.Lmemxor3_uud_loop: - ldr r5, [AP, #-4]! - ldr r7, [BP, #-4]! - lsl r4, ATNC - eor r4, r4, r6, lsl BTNC - eor r4, r4, r5, lsr ACNT - eor r4, r4, r7, lsr BCNT - str r4, [DST, #-4]! -.Lmemxor3_uud_odd: - ldr r4, [AP, #-4]! - ldr r6, [BP, #-4]! - lsl r5, ATNC - eor r5, r5, r7, lsl BTNC - eor r5, r5, r4, lsr ACNT - eor r5, r5, r6, lsr BCNT - str r5, [DST, #-4]! - subs N, #8 - bcs .Lmemxor3_uud_loop - adds N, #8 - beq .Lmemxor3_done - - C FIXME: More clever left-over handling? For now, just adjust pointers. - add AP, AP, ACNT, lsr #3 - add BP, BP, BCNT, lsr #3 - b .Lmemxor3_bytes -EPILOGUE(nettle_memxor3) diff --git a/arm/memxor3.asm b/arm/memxor3.asm new file mode 100644 index 0000000000000000000000000000000000000000..139fd2082278d3b7cb82739530706aabd3cdea44 --- /dev/null +++ b/arm/memxor3.asm @@ -0,0 +1,315 @@ +C arm/memxor3.asm + +ifelse(< + Copyright (C) 2013, 2015 Niels Möller + + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Possible speedups: +C +C The ldm instruction can do load two registers per cycle, +C if the address is two-word aligned. Or three registers in two +C cycles, regardless of alignment. + +C Register usage: + +define(<DST>, <r0>) +define(<AP>, <r1>) +define(<BP>, <r2>) +define(<N>, <r3>) + +C Temporaries r4-r7 +define(<ACNT>, <r8>) +define(<ATNC>, <r10>) +define(<BCNT>, <r11>) +define(<BTNC>, <r12>) + + .syntax unified + + .file "memxor3.asm" + + .text + .arm + + C memxor3(void *dst, const void *a, const void *b, size_t n) + .align 2 +PROLOGUE(nettle_memxor3) + cmp N, #0 + beq .Lmemxor3_ret + + push {r4,r5,r6,r7,r8,r10,r11} + cmp N, #7 + + add AP, N + add BP, N + add DST, N + + bcs .Lmemxor3_large + + C Simple byte loop +.Lmemxor3_bytes: + ldrb r4, [AP, #-1]! + ldrb r5, [BP, #-1]! + eor r4, r5 + strb r4, [DST, #-1]! + subs N, #1 + bne .Lmemxor3_bytes + +.Lmemxor3_done: + pop {r4,r5,r6,r7,r8,r10,r11} +.Lmemxor3_ret: + bx lr + +.Lmemxor3_align_loop: + ldrb r4, [AP, #-1]! + ldrb r5, [BP, #-1]! + eor r5, r4 + strb r5, [DST, #-1]! + sub N, #1 + +.Lmemxor3_large: + tst DST, #3 + bne .Lmemxor3_align_loop + + C We have at least 4 bytes left to do here. + sub N, #4 + ands ACNT, AP, #3 + lsl ACNT, #3 + beq .Lmemxor3_a_aligned + + ands BCNT, BP, #3 + lsl BCNT, #3 + bne .Lmemxor3_uu + + C Swap + mov r4, AP + mov AP, BP + mov BP, r4 + +.Lmemxor3_au: + C NOTE: We have the relevant shift count in ACNT, not BCNT + + C AP is aligned, BP is not + C v original SRC + C +-------+------+ + C |SRC-4 |SRC | + C +---+---+------+ + C |DST-4 | + C +-------+ + C + C With little-endian, we need to do + C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC) + rsb ATNC, ACNT, #32 + bic BP, #3 + + ldr r4, [BP] + + tst N, #4 + itet eq + moveq r5, r4 + subne N, #4 + beq .Lmemxor3_au_odd + +.Lmemxor3_au_loop: + ldr r5, [BP, #-4]! + ldr r6, [AP, #-4]! + eor r6, r6, r4, lsl ATNC + eor r6, r6, r5, lsr ACNT + str r6, [DST, #-4]! +.Lmemxor3_au_odd: + ldr r4, [BP, #-4]! + ldr r6, [AP, #-4]! + eor r6, r6, r5, lsl ATNC + eor r6, r6, r4, lsr ACNT + str r6, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_au_loop + adds N, #8 + beq .Lmemxor3_done + + C Leftover bytes in r4, low end + ldr r5, [AP, #-4] + eor r4, r5, r4, lsl ATNC + +.Lmemxor3_au_leftover: + C Store a byte at a time + ror r4, #24 + strb r4, [DST, #-1]! + subs N, #1 + beq .Lmemxor3_done + subs ACNT, #8 + sub AP, #1 + bne .Lmemxor3_au_leftover + b .Lmemxor3_bytes + +.Lmemxor3_a_aligned: + ands ACNT, BP, #3 + lsl ACNT, #3 + bne .Lmemxor3_au ; + + C a, b and dst all have the same alignment. + subs N, #8 + bcc .Lmemxor3_aligned_word_end + + C This loop runs at 8 cycles per iteration. 
It has been + C observed running at only 7 cycles, for this speed, the loop + C started at offset 0x2ac in the object file. + + C FIXME: consider software pipelining, similarly to the memxor + C loop. + +.Lmemxor3_aligned_word_loop: + ldmdb AP!, {r4,r5,r6} + ldmdb BP!, {r7,r8,r10} + subs N, #12 + eor r4, r7 + eor r5, r8 + eor r6, r10 + stmdb DST!, {r4, r5,r6} + bcs .Lmemxor3_aligned_word_loop + +.Lmemxor3_aligned_word_end: + C We have 0-11 bytes left to do, and N holds number of bytes -12. + adds N, #4 + bcc .Lmemxor3_aligned_lt_8 + C Do 8 bytes more, leftover is in N + ldmdb AP!, {r4, r5} + ldmdb BP!, {r6, r7} + eor r4, r6 + eor r5, r7 + stmdb DST!, {r4,r5} + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_aligned_lt_8: + adds N, #4 + bcc .Lmemxor3_aligned_lt_4 + + ldr r4, [AP,#-4]! + ldr r5, [BP,#-4]! + eor r4, r5 + str r4, [DST,#-4]! + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_aligned_lt_4: + adds N, #4 + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_uu: + + cmp ACNT, BCNT + bic AP, #3 + bic BP, #3 + rsb ATNC, ACNT, #32 + + bne .Lmemxor3_uud + + C AP and BP are unaligned in the same way + + ldr r4, [AP] + ldr r6, [BP] + eor r4, r6 + + tst N, #4 + itet eq + moveq r5, r4 + subne N, #4 + beq .Lmemxor3_uu_odd + +.Lmemxor3_uu_loop: + ldr r5, [AP, #-4]! + ldr r6, [BP, #-4]! + eor r5, r6 + lsl r4, ATNC + eor r4, r4, r5, lsr ACNT + str r4, [DST, #-4]! +.Lmemxor3_uu_odd: + ldr r4, [AP, #-4]! + ldr r6, [BP, #-4]! + eor r4, r6 + lsl r5, ATNC + eor r5, r5, r4, lsr ACNT + str r5, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_uu_loop + adds N, #8 + beq .Lmemxor3_done + + C Leftover bytes in a4, low end + ror r4, ACNT +.Lmemxor3_uu_leftover: + ror r4, #24 + strb r4, [DST, #-1]! + subs N, #1 + beq .Lmemxor3_done + subs ACNT, #8 + bne .Lmemxor3_uu_leftover + b .Lmemxor3_bytes + +.Lmemxor3_uud: + C Both AP and BP unaligned, and in different ways + rsb BTNC, BCNT, #32 + + ldr r4, [AP] + ldr r6, [BP] + + tst N, #4 + ittet eq + moveq r5, r4 + moveq r7, r6 + subne N, #4 + beq .Lmemxor3_uud_odd + +.Lmemxor3_uud_loop: + ldr r5, [AP, #-4]! + ldr r7, [BP, #-4]! + lsl r4, ATNC + eor r4, r4, r6, lsl BTNC + eor r4, r4, r5, lsr ACNT + eor r4, r4, r7, lsr BCNT + str r4, [DST, #-4]! +.Lmemxor3_uud_odd: + ldr r4, [AP, #-4]! + ldr r6, [BP, #-4]! + lsl r5, ATNC + eor r5, r5, r7, lsl BTNC + eor r5, r5, r4, lsr ACNT + eor r5, r5, r6, lsr BCNT + str r5, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_uud_loop + adds N, #8 + beq .Lmemxor3_done + + C FIXME: More clever left-over handling? For now, just adjust pointers. + add AP, AP, ACNT, lsr #3 + add BP, BP, BCNT, lsr #3 + b .Lmemxor3_bytes +EPILOGUE(nettle_memxor3) diff --git a/configure.ac b/configure.ac index bb33962cca1004143ccd18c49c859e0eb5e4e80e..76beb3160853abde4dfaafcb90fcc35a558c6d2f 100644 --- a/configure.ac +++ b/configure.ac @@ -272,7 +272,8 @@ fi # to a new object file). asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ arcfour-crypt.asm camellia-crypt-internal.asm \ - md5-compress.asm memxor.asm poly1305-internal.asm \ + md5-compress.asm memxor.asm memxor3.asm \ + poly1305-internal.asm \ chacha-core-internal.asm \ salsa20-crypt.asm salsa20-core-internal.asm \ serpent-encrypt.asm serpent-decrypt.asm \ @@ -356,6 +357,12 @@ fi AC_SUBST([OPT_ASM_NETTLE_SOURCES]) AC_SUBST([OPT_ASM_HOGWEED_SOURCES]) AC_SUBST([ASM_RODATA]) +if test "x$enable_assembler" = xyes ; then + IF_ASM='' +else + IF_ASM='#' +fi +AC_SUBST([IF_ASM]) AH_VERBATIM([HAVE_NATIVE], [/* Define to 1 each of the following for which a native (ie. 
CPU specific) diff --git a/memxor-internal.h b/memxor-internal.h new file mode 100644 index 0000000000000000000000000000000000000000..dbb5e99073b594d50f7763d8aea8977c7c5dc201 --- /dev/null +++ b/memxor-internal.h @@ -0,0 +1,73 @@ +/* memxor-internal.h + + Copyright (C) 2010, 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +*/ + +#ifndef NETTLE_MEMXOR_INTERNAL_H_INCLUDED +#define NETTLE_MEMXOR_INTERNAL_H_INCLUDED + +#include "nettle-types.h" + +/* The word_t type is intended to be the native word size. */ +#if defined(__x86_64__) || defined(__arch64__) +/* Including on M$ windows, where unsigned long is only 32 bits */ +typedef uint64_t word_t; +#else +typedef unsigned long int word_t; +#endif + +#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t)) + +#ifndef WORDS_BIGENDIAN +#define MERGE(w0, sh_1, w1, sh_2) \ + (((w0) >> (sh_1)) | ((w1) << (sh_2))) +#else +#define MERGE(w0, sh_1, w1, sh_2) \ + (((w0) << (sh_1)) | ((w1) >> (sh_2))) +#endif + +#ifndef WORDS_BIGENDIAN +#define READ_PARTIAL(r,p,n) do { \ + word_t _rp_x; \ + unsigned _rp_i; \ + for (_rp_i = (n), _rp_x = (p)[--_rp_i]; _rp_i > 0;) \ + _rp_x = (_rp_x << CHAR_BIT) | (p)[--_rp_i]; \ + (r) = _rp_x; \ + } while (0) +#else +#define READ_PARTIAL(r,p,n) do { \ + word_t _rp_x; \ + unsigned _rp_i; \ + for (_rp_x = (p)[0], _rp_i = 1; _rp_i < (n); _rp_i++) \ + _rp_x = (_rp_x << CHAR_BIT) | (p)[_rp_i]; \ + (r) = _rp_x; \ + } while (0) +#endif + +#endif /* NETTLE_MEMXOR_INTERNAL_H_INCLUDED */ diff --git a/memxor.c b/memxor.c index 99f46f37b7962b4c62e620df64bac24e5eb59b33..36306ac87e155e1b1d06e2b7a40a13bbd7421be5 100644 --- a/memxor.c +++ b/memxor.c @@ -1,6 +1,6 @@ /* memxor.c - Copyright (C) 2010 Niels Möller + Copyright (C) 2010, 2014 Niels Möller This file is part of GNU Nettle. 
@@ -37,26 +37,11 @@ # include "config.h" #endif +#include <assert.h> #include <limits.h> #include "memxor.h" - -/* For uintptr_t */ -#include "nettle-types.h" - -typedef unsigned long int word_t; - -#if SIZEOF_LONG & (SIZEOF_LONG - 1) -#error Word size must be a power of two -#endif - -#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t)) - -#ifndef WORDS_BIGENDIAN -#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2))) -#else -#define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2))) -#endif +#include "memxor-internal.h" #define WORD_T_THRESH 16 @@ -70,13 +55,14 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n) if (n & 1) { - *dst++ ^= *src++; n--; + dst[n] ^= src[n]; } - for (; n >= 2; dst += 2, src += 2, n -= 2) + while (n >= 2) { - dst[0] ^= src[0]; - dst[1] ^= src[1]; + n -= 2; + dst[n+1] ^= src[n+1]; + dst[n] ^= src[n]; } } @@ -84,35 +70,52 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n) words, not bytes. Assumes we can read complete words at the start and end of the src operand. */ static void -memxor_different_alignment (word_t *dst, const char *src, size_t n) +memxor_different_alignment (word_t *dst, const unsigned char *src, size_t n) { - size_t i; int shl, shr; const word_t *src_word; unsigned offset = ALIGN_OFFSET (src); word_t s0, s1; + assert (n > 0); shl = CHAR_BIT * offset; shr = CHAR_BIT * (sizeof(word_t) - offset); - src_word = (const word_t *) ((uintptr_t) src & -SIZEOF_LONG); + src_word = (const word_t *) ((uintptr_t) src & -sizeof(word_t)); + + /* Read top offset bytes, in native byte order. */ + READ_PARTIAL (s0, (unsigned char *) &src_word[n], offset); +#ifdef WORDS_BIGENDIAN + s0 <<= shr; /* FIXME: Eliminate this shift? */ +#endif - /* FIXME: Unroll four times, like memcmp? */ - i = n & 1; - s0 = src_word[i]; - if (i) + /* Do n-1 regular iterations */ + if (n & 1) + s1 = s0; + else { - s1 = src_word[0]; - dst[0] ^= MERGE (s1, shl, s0, shr); + n--; + s1 = src_word[n]; + dst[n] ^= MERGE (s1, shl, s0, shr); } - for (; i < n; i += 2) + assert (n & 1); + while (n > 2) { - s1 = src_word[i+1]; - dst[i] ^= MERGE(s0, shl, s1, shr); - s0 = src_word[i+2]; - dst[i+1] ^= MERGE(s1, shl, s0, shr); + n -= 2; + s0 = src_word[n+1]; + dst[n+1] ^= MERGE(s0, shl, s1, shr); + s1 = src_word[n]; /* FIXME: Overread on last iteration */ + dst[n] ^= MERGE(s1, shl, s0, shr); } + assert (n == 1); + /* Read low wordsize - offset bytes */ + READ_PARTIAL (s0, src, sizeof(word_t) - offset); +#ifndef WORDS_BIGENDIAN + s0 <<= shl; /* FIXME: eliminate shift? */ +#endif /* !WORDS_BIGENDIAN */ + + dst[0] ^= MERGE(s0, shl, s1, shr); } /* Performance, Intel SU1400 (x86_64): 0.25 cycles/byte aligned, 0.45 @@ -123,216 +126,36 @@ memxor_different_alignment (word_t *dst, const char *src, size_t n) void * memxor(void *dst_in, const void *src_in, size_t n) { - char *dst = dst_in; - const char *src = src_in; - - if (n >= WORD_T_THRESH) - { - /* There are at least some bytes to compare. No need to test - for N == 0 in this alignment loop. */ - while (ALIGN_OFFSET (dst)) - { - *dst++ ^= *src++; - n--; - } - if (ALIGN_OFFSET (src)) - memxor_different_alignment ((word_t *) dst, src, n / sizeof(word_t)); - else - memxor_common_alignment ((word_t *) dst, (const word_t *) src, n / sizeof(word_t)); - - dst += n & -SIZEOF_LONG; - src += n & -SIZEOF_LONG; - n = n & (SIZEOF_LONG - 1); - } - for (; n > 0; n--) - *dst++ ^= *src++; - - return dst_in; -} - - -/* XOR word-aligned areas. n is the number of words, not bytes. 
*/ -static void -memxor3_common_alignment (word_t *dst, - const word_t *a, const word_t *b, size_t n) -{ - /* FIXME: Require n > 0? */ - while (n-- > 0) - dst[n] = a[n] ^ b[n]; -} - -static void -memxor3_different_alignment_b (word_t *dst, - const word_t *a, const char *b, unsigned offset, size_t n) -{ - int shl, shr; - const word_t *b_word; - - word_t s0, s1; - - shl = CHAR_BIT * offset; - shr = CHAR_BIT * (sizeof(word_t) - offset); - - b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); - - if (n & 1) - { - n--; - s1 = b_word[n]; - s0 = b_word[n+1]; - dst[n] = a[n] ^ MERGE (s1, shl, s0, shr); - } - else - s1 = b_word[n]; - - while (n > 0) - { - n -= 2; - s0 = b_word[n+1]; - dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr); - s1 = b_word[n]; - dst[n] = a[n] ^ MERGE(s1, shl, s0, shr); - } -} - -static void -memxor3_different_alignment_ab (word_t *dst, - const char *a, const char *b, - unsigned offset, size_t n) -{ - int shl, shr; - const word_t *a_word; - const word_t *b_word; - - word_t s0, s1; - - shl = CHAR_BIT * offset; - shr = CHAR_BIT * (sizeof(word_t) - offset); - - a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG); - b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); - - if (n & 1) - { - n--; - s1 = a_word[n] ^ b_word[n]; - s0 = a_word[n+1] ^ b_word[n+1]; - dst[n] = MERGE (s1, shl, s0, shr); - } - else - s1 = a_word[n] ^ b_word[n]; - - while (n > 0) - { - n -= 2; - s0 = a_word[n+1] ^ b_word[n+1]; - dst[n+1] = MERGE(s0, shl, s1, shr); - s1 = a_word[n] ^ b_word[n]; - dst[n] = MERGE(s1, shl, s0, shr); - } -} - -static void -memxor3_different_alignment_all (word_t *dst, - const char *a, const char *b, - unsigned a_offset, unsigned b_offset, - size_t n) -{ - int al, ar, bl, br; - const word_t *a_word; - const word_t *b_word; - - word_t a0, a1, b0, b1; - - al = CHAR_BIT * a_offset; - ar = CHAR_BIT * (sizeof(word_t) - a_offset); - bl = CHAR_BIT * b_offset; - br = CHAR_BIT * (sizeof(word_t) - b_offset); - - a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG); - b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); - - if (n & 1) - { - n--; - a1 = a_word[n]; a0 = a_word[n+1]; - b1 = b_word[n]; b0 = b_word[n+1]; - - dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br); - } - else - { - a1 = a_word[n]; - b1 = b_word[n]; - } - - while (n > 0) - { - n -= 2; - a0 = a_word[n+1]; b0 = b_word[n+1]; - dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br); - a1 = a_word[n]; b1 = b_word[n]; - dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br); - } -} - -/* Current implementation processes data in descending order, to - support overlapping operation with one of the sources overlapping - the start of the destination area. This feature is used only - internally by cbc decrypt, and it is not advertised or documented - to nettle users. */ -void * -memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n) -{ - char *dst = dst_in; - const char *a = a_in; - const char *b = b_in; + unsigned char *dst = dst_in; + const unsigned char *src = src_in; if (n >= WORD_T_THRESH) { unsigned i; - unsigned a_offset; - unsigned b_offset; + unsigned offset; size_t nwords; - + /* There are at least some bytes to compare. No need to test + for N == 0 in this alignment loop. 
*/ for (i = ALIGN_OFFSET(dst + n); i > 0; i--) { n--; - dst[n] = a[n] ^ b[n]; + dst[n] ^= src[n]; } - - a_offset = ALIGN_OFFSET(a + n); - b_offset = ALIGN_OFFSET(b + n); - + offset = ALIGN_OFFSET(src + n); nwords = n / sizeof (word_t); n %= sizeof (word_t); - if (a_offset == b_offset) - { - if (!a_offset) - memxor3_common_alignment((word_t *) (dst + n), - (const word_t *) (a + n), - (const word_t *) (b + n), nwords); - else - memxor3_different_alignment_ab((word_t *) (dst + n), - a + n, b + n, a_offset, - nwords); - } - else if (!a_offset) - memxor3_different_alignment_b((word_t *) (dst + n), - (const word_t *) (a + n), b + n, - b_offset, nwords); - else if (!b_offset) - memxor3_different_alignment_b((word_t *) (dst + n), - (const word_t *) (b + n), a + n, - a_offset, nwords); + if (offset) + memxor_different_alignment ((word_t *) (dst+n), src+n, nwords); else - memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n, - a_offset, b_offset, nwords); - + memxor_common_alignment ((word_t *) (dst+n), + (const word_t *) (src+n), nwords); + } + while (n > 0) + { + n--; + dst[n] ^= src[n]; } - while (n-- > 0) - dst[n] = a[n] ^ b[n]; return dst; } diff --git a/memxor3.c b/memxor3.c new file mode 100644 index 0000000000000000000000000000000000000000..fe208bf12e8448178eb0b0e21994f41fa72fcd4c --- /dev/null +++ b/memxor3.c @@ -0,0 +1,292 @@ +/* memxor3.c + + Copyright (C) 2010, 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +*/ + +/* Implementation inspired by memcmp in glibc, contributed to the FSF + by Torbjorn Granlund. + */ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <assert.h> +#include <limits.h> + +#include "memxor.h" +#include "memxor-internal.h" + +#define WORD_T_THRESH 16 + +/* XOR word-aligned areas. n is the number of words, not bytes. */ +static void +memxor3_common_alignment (word_t *dst, + const word_t *a, const word_t *b, size_t n) +{ + /* FIXME: Require n > 0? */ + if (n & 1) + { + n--; + dst[n] = a[n] ^ b[n]; + } + while (n > 0) + { + n -= 2; + dst[n+1] = a[n+1] ^ b[n+1]; + dst[n] = a[n] ^ b[n]; + } +} + +static void +memxor3_different_alignment_b (word_t *dst, + const word_t *a, const unsigned char *b, + unsigned offset, size_t n) +{ + int shl, shr; + const word_t *b_word; + + word_t s0, s1; + + assert (n > 0); + + shl = CHAR_BIT * offset; + shr = CHAR_BIT * (sizeof(word_t) - offset); + + b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t)); + + /* Read top offset bytes, in native byte order. 
*/ + READ_PARTIAL (s0, (unsigned char *) &b_word[n], offset); +#ifdef WORDS_BIGENDIAN + s0 <<= shr; +#endif + + if (n & 1) + s1 = s0; + else + { + n--; + s1 = b_word[n]; + dst[n] = a[n] ^ MERGE (s1, shl, s0, shr); + } + + while (n > 2) + { + n -= 2; + s0 = b_word[n+1]; + dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr); + s1 = b_word[n]; + dst[n] = a[n] ^ MERGE(s1, shl, s0, shr); + } + assert (n == 1); + /* Read low wordsize - offset bytes */ + READ_PARTIAL (s0, b, sizeof(word_t) - offset); +#ifndef WORDS_BIGENDIAN + s0 <<= shl; +#endif /* !WORDS_BIGENDIAN */ + + dst[0] = a[0] ^ MERGE(s0, shl, s1, shr); +} + +static void +memxor3_different_alignment_ab (word_t *dst, + const unsigned char *a, const unsigned char *b, + unsigned offset, size_t n) +{ + int shl, shr; + const word_t *a_word; + const word_t *b_word; + + word_t s0, s1, t; + + assert (n > 0); + + shl = CHAR_BIT * offset; + shr = CHAR_BIT * (sizeof(word_t) - offset); + + a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t)); + b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t)); + + /* Read top offset bytes, in native byte order. */ + READ_PARTIAL (s0, (unsigned char *) &a_word[n], offset); + READ_PARTIAL (t, (unsigned char *) &b_word[n], offset); + s0 ^= t; +#ifdef WORDS_BIGENDIAN + s0 <<= shr; +#endif + + if (n & 1) + s1 = s0; + else + { + n--; + s1 = a_word[n] ^ b_word[n]; + dst[n] = MERGE (s1, shl, s0, shr); + } + + while (n > 2) + { + n -= 2; + s0 = a_word[n+1] ^ b_word[n+1]; + dst[n+1] = MERGE(s0, shl, s1, shr); + s1 = a_word[n] ^ b_word[n]; + dst[n] = MERGE(s1, shl, s0, shr); + } + assert (n == 1); + /* Read low wordsize - offset bytes */ + READ_PARTIAL (s0, a, sizeof(word_t) - offset); + READ_PARTIAL (t, b, sizeof(word_t) - offset); + s0 ^= t; +#ifndef WORDS_BIGENDIAN + s0 <<= shl; +#endif /* !WORDS_BIGENDIAN */ + + dst[0] = MERGE(s0, shl, s1, shr); +} + +static void +memxor3_different_alignment_all (word_t *dst, + const unsigned char *a, const unsigned char *b, + unsigned a_offset, unsigned b_offset, + size_t n) +{ + int al, ar, bl, br; + const word_t *a_word; + const word_t *b_word; + + word_t a0, a1, b0, b1; + + al = CHAR_BIT * a_offset; + ar = CHAR_BIT * (sizeof(word_t) - a_offset); + bl = CHAR_BIT * b_offset; + br = CHAR_BIT * (sizeof(word_t) - b_offset); + + a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t)); + b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t)); + + /* Read top offset bytes, in native byte order. */ + READ_PARTIAL (a0, (unsigned char *) &a_word[n], a_offset); + READ_PARTIAL (b0, (unsigned char *) &b_word[n], b_offset); +#ifdef WORDS_BIGENDIAN + a0 <<= ar; + b0 <<= br; +#endif + + if (n & 1) + { + a1 = a0; b1 = b0; + } + else + { + n--; + a1 = a_word[n]; + b1 = b_word[n]; + + dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br); + } + while (n > 2) + { + n -= 2; + a0 = a_word[n+1]; b0 = b_word[n+1]; + dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br); + a1 = a_word[n]; b1 = b_word[n]; + dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br); + } + assert (n == 1); + /* Read low wordsize - offset bytes */ + READ_PARTIAL (a0, a, sizeof(word_t) - a_offset); + READ_PARTIAL (b0, b, sizeof(word_t) - b_offset); +#ifndef WORDS_BIGENDIAN + a0 <<= al; + b0 <<= bl; +#endif /* !WORDS_BIGENDIAN */ + + dst[0] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br); +} + +/* Current implementation processes data in descending order, to + support overlapping operation with one of the sources overlapping + the start of the destination area. 
This feature is used only + internally by cbc decrypt, and it is not advertised or documented + to nettle users. */ +void * +memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n) +{ + unsigned char *dst = dst_in; + const unsigned char *a = a_in; + const unsigned char *b = b_in; + + if (n >= WORD_T_THRESH) + { + unsigned i; + unsigned a_offset; + unsigned b_offset; + size_t nwords; + + for (i = ALIGN_OFFSET(dst + n); i > 0; i--) + { + n--; + dst[n] = a[n] ^ b[n]; + } + + a_offset = ALIGN_OFFSET(a + n); + b_offset = ALIGN_OFFSET(b + n); + + nwords = n / sizeof (word_t); + n %= sizeof (word_t); + + if (a_offset == b_offset) + { + if (!a_offset) + memxor3_common_alignment((word_t *) (dst + n), + (const word_t *) (a + n), + (const word_t *) (b + n), nwords); + else + memxor3_different_alignment_ab((word_t *) (dst + n), + a + n, b + n, a_offset, + nwords); + } + else if (!a_offset) + memxor3_different_alignment_b((word_t *) (dst + n), + (const word_t *) (a + n), b + n, + b_offset, nwords); + else if (!b_offset) + memxor3_different_alignment_b((word_t *) (dst + n), + (const word_t *) (b + n), a + n, + a_offset, nwords); + else + memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n, + a_offset, b_offset, nwords); + + } + while (n-- > 0) + dst[n] = a[n] ^ b[n]; + + return dst; +} diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in index 96c0bc80aee9b24c239fbc1cb5c2714b6f1fd049..6bc1907c79fbea3c94eabb82f1044116842d3326 100644 --- a/testsuite/Makefile.in +++ b/testsuite/Makefile.in @@ -114,7 +114,7 @@ $(TARGETS) $(EXTRA_TARGETS): testutils.$(OBJEXT) ../nettle-internal.$(OBJEXT) \ # --partial-loads-ok=yes is needed for memxor's handling of unaligned # data. -VALGRIND = valgrind --error-exitcode=1 --leak-check=full --partial-loads-ok=yes --show-reachable=yes +VALGRIND = valgrind --error-exitcode=1 --leak-check=full --show-reachable=yes @IF_ASM@ --partial-loads-ok=yes # The PATH update is for locating dlls on w*ndows. check: $(TS_ALL) diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm index e14e31a13f4617bc202bb66239703caedee4f47d..69d6cfe382a12e8a0d2ace1d1019a9db44d23f24 100644 --- a/x86_64/memxor.asm +++ b/x86_64/memxor.asm @@ -1,7 +1,7 @@ C x86_64/memxor.asm ifelse(< - Copyright (C) 2010, Niels Möller + Copyright (C) 2010, 2014, Niels Möller This file is part of GNU Nettle. @@ -32,9 +32,8 @@ ifelse(< C Register usage: define(<DST>, <%rax>) C Originally in %rdi -define(<AP>, <%rsi>) -define(<BP>, <%rdx>) -define(<N>, <%r10>) +define(<SRC>, <%rsi>) +define(<N>, <%rdx>) define(<TMP>, <%r8>) define(<TMP2>, <%r9>) define(<CNT>, <%rdi>) @@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>) PROLOGUE(nettle_memxor) W64_ENTRY(3, 0) - mov %rdx, %r10 - mov %rdi, %rdx - jmp .Lmemxor3_entry -EPILOGUE(nettle_memxor) - C memxor3(void *dst, const void *a, const void *b, size_t n) - C %rdi %rsi %rdx %rcx - ALIGN(16) - -PROLOGUE(nettle_memxor3) - W64_ENTRY(4, 0) - C %cl needed for shift count, so move away N - mov %rcx, N -.Lmemxor3_entry: test N, N C Get number of unaligned bytes at the end C %rdi is used as CNT, %rax as DST and as return value @@ -87,9 +73,8 @@ PROLOGUE(nettle_memxor3) .Lalign_loop: sub $1, N - movb (AP, N), LREG(TMP) - xorb (BP, N), LREG(TMP) - movb LREG(TMP), (DST, N) + movb (SRC, N), LREG(TMP) + xorb LREG(TMP), (DST, N) sub $1, CNT jnz .Lalign_loop @@ -98,83 +83,7 @@ ifelse(USE_SSE2, yes, < cmp $16, N jnc .Lsse2_case >) - C Check for the case that AP and BP have the same alignment, - C but different from DST. 
- mov AP, TMP - sub BP, TMP - test $7, TMP - jnz .Lno_shift_case - mov AP, %rcx - sub DST, %rcx - and $7, %rcx - jz .Lno_shift_case - sub %rcx, AP - sub %rcx, BP - shl $3, %rcx - - C Unrolling, with aligned values alternating in S0 and S1 - test $8, N - jnz .Lshift_odd - mov (AP, N), S1 - xor (BP, N), S1 - jmp .Lshift_next - -.Lshift_odd: - mov -8(AP, N), S1 - mov (AP, N), S0 - xor -8(BP, N), S1 - xor (BP, N), S0 - mov S1, TMP - shr %cl, TMP - neg %cl - shl %cl, S0 - neg %cl - - or S0, TMP - mov TMP, -8(DST, N) - sub $8, N - jz .Ldone - jmp .Lshift_next - - ALIGN(16) -.Lshift_loop: - mov 8(AP, N), S0 - xor 8(BP, N), S0 - mov S0, TMP - shr %cl, TMP - neg %cl - shl %cl, S1 - neg %cl - or S1, TMP - mov TMP, 8(DST, N) - - mov (AP, N), S1 - xor (BP, N), S1 - mov S1, TMP - shr %cl, TMP - neg %cl - shl %cl, S0 - neg %cl - or S0, TMP - mov TMP, (DST, N) -.Lshift_next: - sub $16, N - C FIXME: Handle the case N == 16 specially, - C like in the non-shifted case? -C ja .Lshift_loop -C jz .Ldone - jnc .Lshift_loop - - add $15, N - jnc .Ldone - - shr $3, %rcx - add %rcx, AP - add %rcx, BP - jmp .Lfinal_loop - -.Lno_shift_case: C Next destination word is -8(DST, N) C Setup for unrolling test $8, N @@ -183,21 +92,18 @@ C jz .Ldone sub $8, N jz .Lone_word - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) jmp .Lword_next ALIGN(16) .Lword_loop: - mov 8(AP, N), TMP - mov (AP, N), TMP2 - xor 8(BP, N), TMP - xor (BP, N), TMP2 - mov TMP, 8(DST, N) - mov TMP2, (DST, N) + mov 8(SRC, N), TMP + mov (SRC, N), TMP2 + xor TMP, 8(DST, N) + xor TMP2, (DST, N) .Lword_next: sub $16, N @@ -205,33 +111,28 @@ C jz .Ldone jnz .Lfinal C Final operation is word aligned - mov 8(AP, N), TMP - xor 8(BP, N), TMP - mov TMP, 8(DST, N) + mov 8(SRC, N), TMP + xor TMP, 8(DST, N) .Lone_word: - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + W64_EXIT(3, 0) ret .Lfinal: add $15, N .Lfinal_loop: - movb (AP, N), LREG(TMP) - xorb (BP, N), LREG(TMP) - movb LREG(TMP), (DST, N) + movb (SRC, N), LREG(TMP) + xorb LREG(TMP), (DST, N) .Lfinal_next: sub $1, N jnc .Lfinal_loop .Ldone: - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + W64_EXIT(3, 0) ret ifelse(USE_SSE2, yes, < @@ -241,15 +142,14 @@ ifelse(USE_SSE2, yes, < test $8, TMP jz .Lsse2_next sub $8, N - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) jmp .Lsse2_next ALIGN(16) .Lsse2_loop: - movdqu (AP, N), %xmm0 - movdqu (BP, N), %xmm1 + movdqu (SRC, N), %xmm0 + movdqa (DST, N), %xmm1 pxor %xmm0, %xmm1 movdqa %xmm1, (DST, N) .Lsse2_next: @@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, < jnz .Lfinal C Final operation is aligned - movdqu (AP), %xmm0 - movdqu (BP), %xmm1 + movdqu (SRC), %xmm0 + movdqa (DST), %xmm1 pxor %xmm0, %xmm1 movdqa %xmm1, (DST) - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + + W64_EXIT(3, 0) ret >) - -EPILOGUE(nettle_memxor3) +EPILOGUE(nettle_memxor) diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm new file mode 100644 index 0000000000000000000000000000000000000000..8ff3e79cf14ef942f78438faac5263b1d9a48d67 --- /dev/null +++ b/x86_64/memxor3.asm @@ -0,0 +1,263 @@ +C x86_64/memxor3.asm + +ifelse(< + Copyright (C) 2010, 2014 Niels Möller + + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: +define(<DST>, <%rax>) C Originally in %rdi +define(<AP>, <%rsi>) +define(<BP>, <%rdx>) +define(<N>, <%r10>) +define(<TMP>, <%r8>) +define(<TMP2>, <%r9>) +define(<CNT>, <%rdi>) +define(<S0>, <%r11>) +define(<S1>, <%rdi>) C Overlaps with CNT + +define(<USE_SSE2>, <no>) + + .file "memxor3.asm" + + .text + + C memxor3(void *dst, const void *a, const void *b, size_t n) + C %rdi %rsi %rdx %rcx + ALIGN(16) + +PROLOGUE(nettle_memxor3) + W64_ENTRY(4, 0) + C %cl needed for shift count, so move away N + mov %rcx, N +.Lmemxor3_entry: + test N, N + C Get number of unaligned bytes at the end + C %rdi is used as CNT, %rax as DST and as return value + mov %rdi, %rax + jz .Ldone + add N, CNT + and $7, CNT + + jz .Laligned + + cmp $8, N + jc .Lfinal_next + + C FIXME: Instead of this loop, could try cmov with memory + C destination, as a sequence of one 8-bit, one 16-bit and one + C 32-bit operations. (Except that cmov can't do 8-bit ops, so + C that step has to use a conditional). +.Lalign_loop: + + sub $1, N + movb (AP, N), LREG(TMP) + xorb (BP, N), LREG(TMP) + movb LREG(TMP), (DST, N) + sub $1, CNT + jnz .Lalign_loop + +.Laligned: +ifelse(USE_SSE2, yes, < + cmp $16, N + jnc .Lsse2_case +>) + C Check for the case that AP and BP have the same alignment, + C but different from DST. + mov AP, TMP + sub BP, TMP + test $7, TMP + jnz .Lno_shift_case + mov AP, %rcx + sub DST, %rcx + and $7, %rcx + jz .Lno_shift_case + sub %rcx, AP + sub %rcx, BP + shl $3, %rcx + + C Unrolling, with aligned values alternating in S0 and S1 + test $8, N + jnz .Lshift_odd + mov (AP, N), S1 + xor (BP, N), S1 + jmp .Lshift_next + +.Lshift_odd: + mov -8(AP, N), S1 + mov (AP, N), S0 + xor -8(BP, N), S1 + xor (BP, N), S0 + mov S1, TMP + shr %cl, TMP + neg %cl + shl %cl, S0 + neg %cl + + or S0, TMP + mov TMP, -8(DST, N) + sub $8, N + jz .Ldone + jmp .Lshift_next + + ALIGN(16) + +.Lshift_loop: + mov 8(AP, N), S0 + xor 8(BP, N), S0 + mov S0, TMP + shr %cl, TMP + neg %cl + shl %cl, S1 + neg %cl + or S1, TMP + mov TMP, 8(DST, N) + + mov (AP, N), S1 + xor (BP, N), S1 + mov S1, TMP + shr %cl, TMP + neg %cl + shl %cl, S0 + neg %cl + or S0, TMP + mov TMP, (DST, N) +.Lshift_next: + sub $16, N + C FIXME: Handle the case N == 16 specially, + C like in the non-shifted case? 
+C ja .Lshift_loop +C jz .Ldone + jnc .Lshift_loop + + add $15, N + jnc .Ldone + + shr $3, %rcx + add %rcx, AP + add %rcx, BP + jmp .Lfinal_loop + +.Lno_shift_case: + C Next destination word is -8(DST, N) + C Setup for unrolling + test $8, N + jz .Lword_next + + sub $8, N + jz .Lone_word + + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + + jmp .Lword_next + + ALIGN(16) + +.Lword_loop: + mov 8(AP, N), TMP + mov (AP, N), TMP2 + xor 8(BP, N), TMP + xor (BP, N), TMP2 + mov TMP, 8(DST, N) + mov TMP2, (DST, N) + +.Lword_next: + sub $16, N + ja .Lword_loop C Not zero and no carry + jnz .Lfinal + + C Final operation is word aligned + mov 8(AP, N), TMP + xor 8(BP, N), TMP + mov TMP, 8(DST, N) + +.Lone_word: + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret + +.Lfinal: + add $15, N + +.Lfinal_loop: + movb (AP, N), LREG(TMP) + xorb (BP, N), LREG(TMP) + movb LREG(TMP), (DST, N) +.Lfinal_next: + sub $1, N + jnc .Lfinal_loop + +.Ldone: + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret + +ifelse(USE_SSE2, yes, < + +.Lsse2_case: + lea (DST, N), TMP + test $8, TMP + jz .Lsse2_next + sub $8, N + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + jmp .Lsse2_next + + ALIGN(16) +.Lsse2_loop: + movdqu (AP, N), %xmm0 + movdqu (BP, N), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST, N) +.Lsse2_next: + sub $16, N + ja .Lsse2_loop + + C FIXME: See if we can do a full word first, before the + C byte-wise final loop. + jnz .Lfinal + + C Final operation is aligned + movdqu (AP), %xmm0 + movdqu (BP), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST) + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret +>) + + +EPILOGUE(nettle_memxor3)
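Both memxor.c and memxor3.c now rely on READ_PARTIAL to pick up the ragged head and tail of an unaligned source without touching bytes outside the buffer, which is what previously required valgrind's --partial-loads-ok. A small self-contained check of that property (not from the patch; the little-endian word_t/READ_PARTIAL definitions are copied from memxor-internal.h above, and the buffer contents are arbitrary):

  /* Check that READ_PARTIAL assembles exactly n bytes, so it never reads
     past the end of a buffer the way a full word load would.
     Little-endian variant shown. */
  #include <assert.h>
  #include <limits.h>
  #include <stdint.h>
  #include <string.h>

  typedef uint64_t word_t;

  #define READ_PARTIAL(r,p,n) do {                          \
      word_t _rp_x;                                         \
      unsigned _rp_i;                                       \
      for (_rp_i = (n), _rp_x = (p)[--_rp_i]; _rp_i > 0;)   \
        _rp_x = (_rp_x << CHAR_BIT) | (p)[--_rp_i];         \
      (r) = _rp_x;                                          \
    } while (0)

  int
  main (void)
  {
    /* Pretend these three bytes sit at the very end of a mapped page. */
    const unsigned char tail[3] = { 0x01, 0x02, 0x03 };
    word_t r, expected = 0;

    READ_PARTIAL (r, tail, 3);

    /* On a little-endian machine this equals a zero-padded partial load. */
    memcpy (&expected, tail, 3);
    assert (r == expected);
    return 0;
  }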
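For readers tracking only the interface: the split leaves the public API unchanged. memxor(dst, src, n) XORs src into dst in place, and memxor3(dst, a, b, n) writes a ^ b to a separate destination (additionally tolerating a source that overlaps the start of dst, as the comment in memxor3.c explains). A caller's-eye usage sketch against the installed header (assumes a normal Nettle installation; link with -lnettle; the values are arbitrary):

  /* Usage sketch, not a test from the patch. */
  #include <assert.h>
  #include <string.h>
  #include <nettle/memxor.h>

  int
  main (void)
  {
    unsigned char a[16], b[16], out[16];

    memset (a, 0x5a, sizeof (a));
    memset (b, 0xff, sizeof (b));

    memxor3 (out, a, b, sizeof (out));   /* out = a ^ b */
    memxor (a, b, sizeof (a));           /* a ^= b, in place */

    assert (memcmp (a, out, sizeof (out)) == 0);
    assert (out[0] == (0x5a ^ 0xff));
    return 0;
  }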