C arm/memxor.asm

ifelse(<
   Copyright (C) 2013 Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
>) 

C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.

C Register usage:
C
C   DST  r0   destination pointer, first argument; post-incremented
C             as data is stored
C   SRC  r1   source pointer, second argument; post-incremented as
C             data is loaded
C   N    r2   byte count, third argument; counted down, and biased by
C             a constant in the word-oriented paths
C   CNT  r6   right shift count in bits, used only when SRC and DST
C             have different word alignment
C   TNC  r12  32 - CNT, the matching left shift count
C
C r6 and r12 are also used directly, by number, as data registers in
C the paths where no shift counts are needed.

define(<DST>, <r0>)
define(<SRC>, <r1>)
define(<N>, <r2>)
define(<CNT>, <r6>)
define(<TNC>, <r12>)

	.syntax unified

	.file "memxor.asm"

	.text
	.arm

C memxor(void *dst, const void *src, size_t n)
C
C XOR the n bytes at src into the n bytes at dst: dst[i] ^= src[i].
C
C In:    DST (r0) = dst, SRC (r1) = src, N (r2) = n
C Uses:  r3 and r12 as scratch everywhere; r4-r8, r10, r11 and r14
C        only between a push and the matching pop.
C Out:   no result is placed in r0. DST is advanced while storing, so
C        r0 does not hold the original dst at return.
C        NOTE(review): if the C prototype is expected to return dst,
C        that is not done here - confirm against the header.
C
C Strategy:
C   n == 0   return at once
C   n <  7   plain byte loop
C   n >= 7   byte steps until DST is word aligned, then either
C              - SRC is word aligned too: three-word ldm/stm loop, or
C              - SRC is not: word loop that combines two aligned
C                source words with shifts (the shift directions used
C                are the little-endian ones)
C            and finally the byte loop for whatever tail remains.
C
C The instruction sequence is intentionally left exactly as it was.
C The timing remark ahead of .Lmemxor_same_loop ties the measured
C speed to the loop's offset in the object file, so adding or
C removing instructions in front of it may change performance.

	.align 4
PROLOGUE(nettle_memxor)
	C Nothing to do for n == 0.
	cmp	N, #0
	beq	.Lmemxor_done

	C Unsigned n >= 7 takes the word-oriented path.
	cmp	N, #7
	bcs	.Lmemxor_large

	C Simple byte loop. Always entered with N > 0; the other paths
	C branch here as well to finish off a tail of a few bytes.
.Lmemxor_bytes:
	ldrb	r3, [SRC], #+1
	ldrb	r12, [DST]
	eor	r3, r12
	strb	r3, [DST], #+1
	subs	N, #1
	bne	.Lmemxor_bytes

.Lmemxor_done:
	bx	lr

	C One byte step towards a word-aligned DST. Falls through into
	C the alignment test below.
.Lmemxor_align_loop:
	ldrb	r3, [SRC], #+1
	ldrb	r12, [DST]
	eor	r3, r12
	strb	r3, [DST], #+1
	sub	N, #1

.Lmemxor_large:
	tst	DST, #3
	bne	.Lmemxor_align_loop

	C We have at least 4 bytes left to do here: n was at least 7
	C and at most 3 alignment steps were taken.
	C From now on N is biased, N = bytes left - 4.
	sub	N, #4

	C r3 = SRC & 3. Zero means SRC is word aligned as well.
	ands	r3, SRC, #3
	beq	.Lmemxor_same

	C Different alignment case.
	C     v original SRC
	C +-------+------+
	C |SRC    |SRC+4 |
	C +---+---+------+
	C     |DST    |
	C     +-------+
	C
	C With little-endian, we need to do
	C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)

	push	{r4,r5,r6}

	C CNT = 8 * (SRC & 3), the bit offset of the first source byte
	C inside its aligned word. SRC is rounded down to that word and
	C TNC = 32 - CNT.
	lsl	CNT, r3, #3
	bic	SRC, #3
	rsb	TNC, CNT, #32

	ldr	r4, [SRC], #+4

	C The word loop is unrolled twice. N is still bytes left - 4,
	C so bit 2 clear means an odd number of whole words remains:
	C enter at the second half with the first source word in r5.
	C Otherwise rebias to N = bytes left - 8 and start at the top.
	tst	N, #4
	itet	eq
	moveq	r5, r4
	subne	N, #4
	beq	.Lmemxor_odd

.Lmemxor_word_loop:
	C r4 holds the current aligned source word, r5 gets the next.
	ldr	r5, [SRC], #+4
	ldr	r3, [DST]
	eor	r3, r3, r4, lsr CNT
	eor	r3, r3, r5, lsl TNC
	str	r3, [DST], #+4
.Lmemxor_odd:
	C Same step with the roles of r4 and r5 swapped.
	ldr	r4, [SRC], #+4
	ldr	r3, [DST]
	eor	r3, r3, r5, lsr CNT
	eor	r3, r3, r4, lsl TNC
	str	r3, [DST], #+4
	subs	N, #8
	bcs	.Lmemxor_word_loop
	C Undo the bias: N = bytes left, which is 0-3 here.
	adds	N, #8
	beq	.Lmemxor_odd_done

	C We have TNC/8 left-over bytes in r4, high end.
	C Shift them down to the low end and XOR them with the word at
	C DST. Only the low N bytes of the result get stored below.
	lsr	r4, CNT
	ldr	r3, [DST]
	eor	r3, r4

	pop	{r4,r5,r6}

	C Store bytes, one by one. TNC lives in r12, which was not
	C popped, and counts the bits of r3 not yet stored. If it
	C reaches zero with bytes still to do, the byte loop takes over
	C and fetches fresh source bytes.
.Lmemxor_leftover:
	strb	r3, [DST], #+1
	subs	N, #1
	beq	.Lmemxor_done
	subs	TNC, #8
	C This lsr has no s suffix, so bne below still tests the
	C result of subs TNC.
	lsr	r3, #8
	bne	.Lmemxor_leftover
	b	.Lmemxor_bytes
.Lmemxor_odd_done:
	pop	{r4,r5,r6}
	bx	lr

	C Same alignment case: SRC and DST are both word aligned.
.Lmemxor_same:
	push	{r4,r5,r6,r7,r8,r10,r11,r14}	C r14 is lr, the link register

	C N = bytes left - 12. With fewer than 12 bytes left, skip the
	C three-word code entirely.
	subs	N, #8
	bcc	.Lmemxor_same_end

	ldmia	SRC!, {r3, r4, r5}
	C Keep address for loads in r14
	mov	r14, DST
	ldmia	r14!, {r6, r7, r8}
	C N = bytes left - 24. With fewer than 24 bytes left, only
	C this one block of three words is done.
	subs	N, #12
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
	bcc	.Lmemxor_same_final_store
	C N = bytes left - 36. The second block of DST words is loaded
	C before branching since the wind down code needs it too; ldmia
	C leaves the flags from subs untouched.
	subs	N, #12
	ldmia	r14!, {r6, r7, r8}
	bcc	.Lmemxor_same_wind_down

	C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
	C loop starts at offset 0x11c in the object file.

.Lmemxor_same_loop:
	C r10-r12 contain values to be stored at DST
	C r6-r8 contain values read from r14, in advance
	ldmia	SRC!, {r3, r4, r5}
	subs	N, #12
	stmia	DST!, {r10, r11, r12}
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
	ldmia	r14!, {r6, r7, r8}
	bcs	.Lmemxor_same_loop

.Lmemxor_same_wind_down:
	C Wind down code: store the pending block, then combine the
	C DST words already held in r6-r8 with one more source block.
	ldmia	SRC!, {r3, r4, r5}
	stmia	DST!, {r10, r11, r12}
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
.Lmemxor_same_final_store:
	stmia	DST!, {r10, r11, r12}

.Lmemxor_same_end:
	C We have 0-11 bytes left to do, and N holds number of bytes -12.
	C After the add N = bytes left - 8, with carry clear if that
	C is negative.
	adds	N, #4
	bcc	.Lmemxor_same_lt_8
	C Do 8 bytes more, leftover is in N
	ldmia	SRC!, {r3, r4}
	ldmia	DST, {r6, r7}
	eor	r3, r6
	eor	r4, r7
	stmia	DST!, {r3, r4}
	C Nothing since the adds above has written the flags, so beq
	C tests whether the leftover count in N is zero.
	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

.Lmemxor_same_lt_8:
	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
	C N = bytes left - 4, with carry clear if that is negative.
	adds	N, #4
	bcc	.Lmemxor_same_lt_4

	C Do one more word. The flags still come from the adds above.
	ldr	r3, [SRC], #+4
	ldr	r12, [DST]
	eor	r3, r12
	str	r3, [DST], #+4
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

.Lmemxor_same_lt_4:
	C N = bytes left, 0-3.
	adds	N, #4
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

EPILOGUE(nettle_memxor)