Skip to content
Snippets Groups Projects
Select Git revision
  • ade7779c98a5426c7d86c8a01bbd7ad65980c9b9
  • master default protected
  • siv-mode
  • delete-des-compat
  • delete-rsa_blind
  • aes-struct-layout
  • master-updates
  • release-3.4-fixes
  • struct-layout
  • attribute-deprecated
  • rename-data-symbols
  • x86_64-sha_ni-sha256
  • ecc-params-tweak
  • delete-old-aes
  • cmac-support
  • x86_64-sha_ni-sha1
  • gcm-ctr-opt
  • ctr-opt
  • skein
  • api-opaque-fix
  • curve448
  • nettle_3.4.1_release_20181204
  • nettle_3.4.1rc1
  • nettle_3.4_release_20171119
  • nettle_3.4rc2
  • nettle_3.4rc1
  • nettle_3.3_release_20161001
  • nettle_3.2_release_20160128
  • nettle_3.1.1_release_20150424
  • nettle_3.1_release_20150407
  • nettle_3.1rc3
  • nettle_3.1rc2
  • nettle_3.1rc1
  • nettle_3.0_release_20140607
  • nettle_2.7.1_release_20130528
  • nettle_2.7_release_20130424
  • nettle_2.6_release_20130116
  • nettle_2.5_release_20120707
  • converted-master-branch-to-git
  • nettle_2.4_release_20110903
  • nettle_2.3_release_20110902
41 results

memxor.asm

Blame
  • Forked from Nettle / nettle
    Source project has a limited visibility.
    memxor.asm 9.00 KiB
    C -*- mode: asm; asm-comment-char: ?C; -*-
    C nettle, low-level cryptographics library
    C
    C Copyright (C) 2013, Niels Möller
    C
    C The nettle library is free software; you can redistribute it and/or modify
    C it under the terms of the GNU Lesser General Public License as published by
    C the Free Software Foundation; either version 2.1 of the License, or (at your
    C option) any later version.
    C
    C The nettle library is distributed in the hope that it will be useful, but
    C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
    C License for more details.
    C
    C You should have received a copy of the GNU Lesser General Public License
    C along with the nettle library; see the file COPYING.LIB.  If not, write to
    C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
    C MA 02111-1301, USA.
    
    C Possible speedups:
    C
    C The ldm instruction can do load two registers per cycle,
    C if the address is two-word aligned. Or three registers in two
    C cycles, regardless of alignment.
    
    C Register usage:
    
    define(<DST>, <r0>)
    define(<SRC>, <r1>)
    define(<N>, <r2>)
    define(<CNT>, <r6>)
    define(<TNC>, <r12>)
    
    	.syntax unified
    
    	.file "memxor.asm"
    
    	.text
    	.arm
    
    	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
    	.align 4
    PROLOGUE(memxor)
    	cmp	N, #0
    	beq	.Lmemxor_done
    
    	cmp	N, #7
    	bcs	.Lmemxor_large
    
    	C Simple byte loop
    .Lmemxor_bytes:
    	ldrb	r3, [SRC], #+1
    	ldrb	r12, [DST]
    	eor	r3, r12
    	strb	r3, [DST], #+1
    	subs	N, #1
    	bne	.Lmemxor_bytes
    
    .Lmemxor_done:
    	bx	lr
    
    .Lmemxor_align_loop:
    	ldrb	r3, [SRC], #+1
    	ldrb	r12, [DST]
    	eor	r3, r12
    	strb	r3, [DST], #+1
    	sub	N, #1
    
    .Lmemxor_large:
    	tst	DST, #3
    	bne	.Lmemxor_align_loop
    
    	C We have at least 4 bytes left to do here.
    	sub	N, #4
    
    	ands	r3, SRC, #3
    	beq	.Lmemxor_same
    
    	C Different alignment case.
    	C     v original SRC
    	C +-------+------+
    	C |SRC    |SRC+4 |
    	C +---+---+------+
    	C     |DST    |
    	C     +-------+
    	C
    	C With little-endian, we need to do
    	C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
    
    	push	{r4,r5,r6}
    	
    	lsl	CNT, r3, #3
    	bic	SRC, #3
    	rsb	TNC, CNT, #32
    
    	ldr	r4, [SRC], #+4
    
    	tst	N, #4
    	itet	eq
    	moveq	r5, r4
    	subne	N, #4
    	beq	.Lmemxor_odd
    
    .Lmemxor_word_loop:
    	ldr	r5, [SRC], #+4
    	ldr	r3, [DST]
    	eor	r3, r3, r4, lsr CNT
    	eor	r3, r3, r5, lsl TNC
    	str	r3, [DST], #+4
    .Lmemxor_odd:
    	ldr	r4, [SRC], #+4
    	ldr	r3, [DST]
    	eor	r3, r3, r5, lsr CNT
    	eor	r3, r3, r4, lsl TNC
    	str	r3, [DST], #+4
    	subs	N, #8
    	bcs	.Lmemxor_word_loop
    	adds	N, #8
    	beq	.Lmemxor_odd_done
    
    	C We have TNC/8 left-over bytes in r4, high end
    	lsr	r4, CNT
    	ldr	r3, [DST]
    	eor	r3, r4
    
    	pop	{r4,r5,r6}
    
    	C Store bytes, one by one.
    .Lmemxor_leftover:
    	strb	r3, [DST], #+1
    	subs	N, #1
    	beq	.Lmemxor_done
    	subs	TNC, #8
    	lsr	r3, #8
    	bne	.Lmemxor_leftover
    	b	.Lmemxor_bytes
    .Lmemxor_odd_done:
    	pop	{r4,r5,r6}
    	bx	lr
    
    .Lmemxor_same:
    	push	{r4,r5,r6,r7,r8,r10,r11,r14}	C lr is the link register
    
    	subs	N, #8
    	bcc	.Lmemxor_same_end
    
    	ldmia	SRC!, {r3, r4, r5}
    	C Keep address for loads in r14
    	mov	r14, DST
    	ldmia	r14!, {r6, r7, r8}
    	subs	N, #12
    	eor	r10, r3, r6
    	eor	r11, r4, r7
    	eor	r12, r5, r8
    	bcc	.Lmemxor_same_final_store
    	subs	N, #12
    	ldmia	r14!, {r6, r7, r8}
    	bcc	.Lmemxor_same_wind_down
    
    	C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
    	C loop starts at offset 0x11c in the object file.
    
    .Lmemxor_same_loop:
    	C r10-r12 contains values to be stored at DST
    	C r6-r8 contains values read from r14, in advance
    	ldmia	SRC!, {r3, r4, r5}
    	subs	N, #12
    	stmia	DST!, {r10, r11, r12}
    	eor	r10, r3, r6
    	eor	r11, r4, r7
    	eor	r12, r5, r8
    	ldmia	r14!, {r6, r7, r8}
    	bcs	.Lmemxor_same_loop
    
    .Lmemxor_same_wind_down:
    	C Wind down code
    	ldmia	SRC!, {r3, r4, r5}
    	stmia	DST!, {r10, r11, r12}
    	eor	r10, r3, r6
    	eor	r11, r4, r7
    	eor	r12, r5, r8
    .Lmemxor_same_final_store:
    	stmia	DST!, {r10, r11, r12}
    	
    .Lmemxor_same_end:
    	C We have 0-11 bytes left to do, and N holds number of bytes -12.
    	adds	N, #4
    	bcc	.Lmemxor_same_lt_8
    	C Do 8 bytes more, leftover is in N
    	ldmia	SRC!, {r3, r4}
    	ldmia	DST, {r6, r7}
    	eor	r3, r6
    	eor	r4, r7
    	stmia	DST!, {r3, r4}
    	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
    	beq	.Lmemxor_done
    	b	.Lmemxor_bytes
    
    .Lmemxor_same_lt_8:
    	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
    	adds	N, #4
    	bcc	.Lmemxor_same_lt_4
    
    	ldr	r3, [SRC], #+4
    	ldr	r12, [DST]
    	eor	r3, r12
    	str	r3, [DST], #+4
    	beq	.Lmemxor_done
    	b	.Lmemxor_bytes
    
    .Lmemxor_same_lt_4:
    	adds	N, #4
    	beq	.Lmemxor_done
    	b	.Lmemxor_bytes
    	
    EPILOGUE(memxor)
    
    define(<DST>, <r0>)
    define(<AP>, <r1>)
    define(<BP>, <r2>)
    define(<N>, <r3>)
    undefine(<CNT>)
    undefine(<TNC>)
    
    C Temporaries r4-r7
    define(<ACNT>, <r8>)
    define(<ATNC>, <r10>)
    define(<BCNT>, <r11>)
    define(<BTNC>, <r12>)
    
    	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
    	.align 2
    PROLOGUE(memxor3)
    	cmp	N, #0
    	beq	.Lmemxor3_ret
    
    	push	{r4,r5,r6,r7,r8,r10,r11}
    	cmp	N, #7
    
    	add	AP, N
    	add	BP, N
    	add	DST, N
    
    	bcs	.Lmemxor3_large
    
    	C Simple byte loop
    .Lmemxor3_bytes:
    	ldrb	r4, [AP, #-1]!
    	ldrb	r5, [BP, #-1]!
    	eor	r4, r5
    	strb	r4, [DST, #-1]!
    	subs	N, #1
    	bne	.Lmemxor3_bytes
    
    .Lmemxor3_done:
    	pop	{r4,r5,r6,r7,r8,r10,r11}
    .Lmemxor3_ret:
    	bx	lr
    
    .Lmemxor3_align_loop:
    	ldrb	r4, [AP, #-1]!
    	ldrb	r5, [BP, #-1]!
    	eor	r5, r4
    	strb	r5, [DST, #-1]!
    	sub	N, #1
    
    .Lmemxor3_large:
    	tst	DST, #3
    	bne	.Lmemxor3_align_loop
    
    	C We have at least 4 bytes left to do here.
    	sub	N, #4
    	ands	ACNT, AP, #3
    	lsl	ACNT, #3
    	beq	.Lmemxor3_a_aligned
    
    	ands	BCNT, BP, #3
    	lsl	BCNT, #3
    	bne	.Lmemxor3_uu
    
    	C Swap
    	mov	r4, AP
    	mov	AP, BP
    	mov	BP, r4
    
    .Lmemxor3_au:
    	C NOTE: We have the relevant shift count in ACNT, not BCNT
    
    	C AP is aligned, BP is not
    	C           v original SRC
    	C +-------+------+
    	C |SRC-4  |SRC   |
    	C +---+---+------+
    	C     |DST-4  |
    	C     +-------+
    	C
    	C With little-endian, we need to do
    	C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
    	rsb	ATNC, ACNT, #32
    	bic	BP, #3
    
    	ldr	r4, [BP]
    
    	tst	N, #4
    	itet	eq
    	moveq	r5, r4
    	subne	N, #4
    	beq	.Lmemxor3_au_odd
    
    .Lmemxor3_au_loop:
    	ldr	r5, [BP, #-4]!
    	ldr	r6, [AP, #-4]!
    	eor	r6, r6, r4, lsl ATNC
    	eor	r6, r6, r5, lsr ACNT
    	str	r6, [DST, #-4]!
    .Lmemxor3_au_odd:
    	ldr	r4, [BP, #-4]!
    	ldr	r6, [AP, #-4]!
    	eor	r6, r6, r5, lsl ATNC
    	eor	r6, r6, r4, lsr ACNT
    	str	r6, [DST, #-4]!
    	subs	N, #8
    	bcs	.Lmemxor3_au_loop
    	adds	N, #8
    	beq	.Lmemxor3_done
    
    	C Leftover bytes in r4, low end
    	ldr	r5, [AP, #-4]
    	eor	r4, r5, r4, lsl ATNC
    
    .Lmemxor3_au_leftover:
    	C Store a byte at a time
    	ror	r4, #24
    	strb	r4, [DST, #-1]!
    	subs	N, #1
    	beq	.Lmemxor3_done
    	subs	ACNT, #8
    	sub	AP, #1
    	bne	.Lmemxor3_au_leftover
    	b	.Lmemxor3_bytes
    
    .Lmemxor3_a_aligned:
    	ands	ACNT, BP, #3
    	lsl	ACNT, #3
    	bne	.Lmemxor3_au ;
    
    	C a, b and dst all have the same alignment.
    	subs	N, #8
    	bcc	.Lmemxor3_aligned_word_end
    
    	C This loop runs at 8 cycles per iteration. It has been
    	C observed running at only 7 cycles, for this speed, the loop
    	C started at offset 0x2ac in the object file.
    
    	C FIXME: consider software pipelining, similarly to the memxor
    	C loop.
    	
    .Lmemxor3_aligned_word_loop:
    	ldmdb	AP!, {r4,r5,r6}
    	ldmdb	BP!, {r7,r8,r10}
    	subs	N, #12
    	eor	r4, r7
    	eor	r5, r8
    	eor	r6, r10
    	stmdb	DST!, {r4, r5,r6}
    	bcs	.Lmemxor3_aligned_word_loop
    
    .Lmemxor3_aligned_word_end:
    	C We have 0-11 bytes left to do, and N holds number of bytes -12.
    	adds	N, #4
    	bcc	.Lmemxor3_aligned_lt_8
    	C Do 8 bytes more, leftover is in N
    	ldmdb	AP!, {r4, r5}
    	ldmdb	BP!, {r6, r7}
    	eor	r4, r6
    	eor	r5, r7
    	stmdb	DST!, {r4,r5}
    	beq	.Lmemxor3_done
    	b	.Lmemxor3_bytes
    
    .Lmemxor3_aligned_lt_8:
    	adds	N, #4
    	bcc	.Lmemxor3_aligned_lt_4
    
    	ldr	r4, [AP,#-4]!
    	ldr	r5, [BP,#-4]!
    	eor	r4, r5
    	str	r4, [DST,#-4]!
    	beq	.Lmemxor3_done
    	b	.Lmemxor3_bytes
    
    .Lmemxor3_aligned_lt_4:
    	adds	N, #4	
    	beq	.Lmemxor3_done
    	b	.Lmemxor3_bytes
    
    .Lmemxor3_uu:
    
    	cmp	ACNT, BCNT
    	bic	AP, #3
    	bic	BP, #3
    	rsb	ATNC, ACNT, #32
    
    	bne	.Lmemxor3_uud
    
    	C AP and BP are unaligned in the same way
    
    	ldr	r4, [AP]
    	ldr	r6, [BP]
    	eor	r4, r6
    
    	tst	N, #4
    	itet	eq
    	moveq	r5, r4
    	subne	N, #4
    	beq	.Lmemxor3_uu_odd
    
    .Lmemxor3_uu_loop:
    	ldr	r5, [AP, #-4]!
    	ldr	r6, [BP, #-4]!
    	eor	r5, r6
    	lsl	r4, ATNC
    	eor	r4, r4, r5, lsr ACNT
    	str	r4, [DST, #-4]!
    .Lmemxor3_uu_odd:
    	ldr	r4, [AP, #-4]!
    	ldr	r6, [BP, #-4]!
    	eor	r4, r6
    	lsl	r5, ATNC
    	eor	r5, r5, r4, lsr ACNT
    	str	r5, [DST, #-4]!
    	subs	N, #8
    	bcs	.Lmemxor3_uu_loop
    	adds	N, #8
    	beq	.Lmemxor3_done
    
    	C Leftover bytes in a4, low end
    	ror	r4, ACNT
    .Lmemxor3_uu_leftover:
    	ror	r4, #24
    	strb	r4, [DST, #-1]!
    	subs	N, #1
    	beq	.Lmemxor3_done
    	subs	ACNT, #8
    	bne	.Lmemxor3_uu_leftover
    	b	.Lmemxor3_bytes
    
    .Lmemxor3_uud:
    	C Both AP and BP unaligned, and in different ways
    	rsb	BTNC, BCNT, #32
    
    	ldr	r4, [AP]
    	ldr	r6, [BP]
    
    	tst	N, #4
    	ittet	eq
    	moveq	r5, r4
    	moveq	r7, r6
    	subne	N, #4
    	beq	.Lmemxor3_uud_odd
    
    .Lmemxor3_uud_loop:
    	ldr	r5, [AP, #-4]!
    	ldr	r7, [BP, #-4]!
    	lsl	r4, ATNC
    	eor	r4, r4, r6, lsl BTNC
    	eor	r4, r4, r5, lsr ACNT
    	eor	r4, r4, r7, lsr BCNT
    	str	r4, [DST, #-4]!
    .Lmemxor3_uud_odd:
    	ldr	r4, [AP, #-4]!
    	ldr	r6, [BP, #-4]!
    	lsl	r5, ATNC
    	eor	r5, r5, r7, lsl BTNC
    	eor	r5, r5, r4, lsr ACNT
    	eor	r5, r5, r6, lsr BCNT
    	str	r5, [DST, #-4]!
    	subs	N, #8
    	bcs	.Lmemxor3_uud_loop
    	adds	N, #8
    	beq	.Lmemxor3_done
    
    	C FIXME: More clever left-over handling? For now, just adjust pointers.
    	add	AP, AP,	ACNT, lsr #3
    	add	BP, BP, BCNT, lsr #3
    	b	.Lmemxor3_bytes
    EPILOGUE(memxor3)