C x86_64/memxor.asm

ifelse(<
   Copyright (C) 2010, 2014, Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
>)
C Register usage:
C
C N serves both as the number of bytes still to process and as the
C index, relative to DST and SRC, just past the next byte to process.
define(<DST>, <%rax>) C Originally in %rdi
define(<SRC>, <%rsi>)
define(<N>, <%rdx>)
C Scratch registers holding source data that is xored into DST.
define(<TMP>, <%r8>)
define(<TMP2>, <%r9>)
C Number of bytes to process one at a time before DST + N is 8-byte
C aligned. Shares %rdi with the incoming dst argument, which is why
C DST is moved to %rax.
define(<CNT>, <%rdi>)
C NOTE(review): S0 and S1 are not referenced by the code visible in
C this file; kept in case other sources rely on them.
define(<S0>, <%r11>)
define(<S1>, <%rdi>) C Overlaps with CNT

C NOTE(review): the code below tests USE_SSE2 with ifdef, which only
C checks that the macro is defined, not what its value is. With this
C definition in place the SSE2 blocks are therefore always included.
C Confirm whether that is intended.
define(<USE_SSE2>, <no>)

C nettle_memxor
C
C 	memxor(void *dst, const void *src, size_t n)
C 	          %rdi               %rsi      %rdx
C
C Xors the n bytes at src into the n bytes at dst and returns dst in
C %rax. N is both the number of bytes left and the index of the next
C position, so the buffers are processed from the end towards the
C start:
C   1. single bytes, until DST + N is 8-byte aligned,
C   2. 8-byte words, two per iteration, or 16-byte SSE2 blocks,
C   3. single bytes for whatever remains at the start.
C Only the destination address is aligned. Source loads use the same
C index and may be unaligned.
C
C W64_ENTRY, W64_EXIT, PROLOGUE, EPILOGUE, ALIGN and LREG are macros
C defined outside this file. LREG(TMP) presumably names the low byte
C of TMP -- confirm against the macro definitions.

	.file "memxor.asm"

	.text

	ALIGN(16)

PROLOGUE(nettle_memxor)
	W64_ENTRY(3, 0)

	test	N, N
	C %rdi is used as CNT, %rax as DST and as return value.
	C The mov does not modify the flags set by the test above.
	mov	%rdi, %rax
	jz	.Ldone			C n == 0, nothing to do

	C Get number of unaligned bytes at the end:
	C CNT = (dst + n) & 7
	add	N, CNT
	and	$7, CNT

	jz	.Laligned

	C Fewer than 8 bytes in total, process all of them bytewise.
	cmp	$8, N
	jc	.Lfinal_next

	C Here N is at least 8 and CNT is at most 7, so N stays
	C positive throughout the loop.
	C FIXME: Instead of this loop, could try cmov with memory
	C destination, as a sequence of one 8-bit, one 16-bit and one
	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
	C that step has to use a conditional).
.Lalign_loop:

	sub	$1, N
	movb	(SRC, N), LREG(TMP)
	xorb	LREG(TMP), (DST, N)
	sub	$1, CNT
	jnz	.Lalign_loop

	C DST + N is now 8-byte aligned and N is at least 1.
.Laligned:
C NOTE(review): ifdef is true whenever USE_SSE2 is defined, whatever
C its value, and this file defines it. Confirm that always taking the
C SSE2 path for 16 bytes or more is intended.
ifdef(<USE_SSE2>, <
	cmp	$16, N
	jnc	.Lsse2_case
>)

	C Next destination word is -8(DST, N)
	C Setup for unrolling: if bit 3 of N is set, process a single
	C word first, so that the unrolled loop can step by 16.
	test	$8, N
	jz	.Lword_next

	sub	$8, N
	jz	.Lone_word		C That word starts at offset 0

	mov	(SRC, N), TMP
	xor	TMP, (DST, N)

	jmp	.Lword_next

	ALIGN(16)

	C Two words per iteration, at offsets N + 8 and N.
.Lword_loop:
	mov	8(SRC, N), TMP
	mov	(SRC, N), TMP2
	xor	TMP, 8(DST, N)
	xor	TMP2, (DST, N)

.Lword_next:
	sub	$16, N
	ja	.Lword_loop	C Not zero and no carry
	jnz	.Lfinal		C Borrow: fewer than 16 bytes were left

	C Final operation is word aligned: exactly 16 bytes were left
	C and N is now zero.
	mov	8(SRC, N), TMP
	xor	TMP, 8(DST, N)

.Lone_word:
	mov	(SRC, N), TMP
	xor	TMP, (DST, N)

	W64_EXIT(3, 0)
	ret

	C N holds the remaining count minus 16. Adding 15 turns it
	C into the index of the last byte still to be processed.
.Lfinal:
	add	$15, N

.Lfinal_loop:
	movb	(SRC, N), LREG(TMP)
	xorb	LREG(TMP), (DST, N)
.Lfinal_next:
	sub	$1, N
	jnc	.Lfinal_loop		C Stop once N wraps below zero

.Ldone:
	W64_EXIT(3, 0)
	ret

ifdef(<USE_SSE2>, <

	C Entered with N at least 16 and DST + N 8-byte aligned.
	C If DST + N is not also 16-byte aligned, process one word
	C first so that the movdqa accesses below are aligned.
.Lsse2_case:
	lea	(DST, N), TMP
	test	$8, TMP
	jz	.Lsse2_next
	sub	$8, N
	mov	(SRC, N), TMP
	xor	TMP, (DST, N)
	jmp	.Lsse2_next

	ALIGN(16)
	C One 16-byte block per iteration. The source load is
	C unaligned, the destination accesses are aligned.
.Lsse2_loop:
	movdqu	(SRC, N), %xmm0
	movdqa	(DST, N), %xmm1
	pxor	%xmm0, %xmm1
	movdqa	%xmm1, (DST, N)
.Lsse2_next:
	sub	$16, N
	ja	.Lsse2_loop

	C FIXME: See if we can do a full word first, before the
	C byte-wise final loop.
	jnz	.Lfinal

	C Final operation is aligned: exactly 16 bytes were left
	C and N is now zero.
	movdqu	(SRC), %xmm0
	movdqa	(DST), %xmm1
	pxor	%xmm0, %xmm1
	movdqa	%xmm1, (DST)

	W64_EXIT(3, 0)
	ret
>)

EPILOGUE(nettle_memxor)