1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 /*
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions. This file contains an accelerated
34 * Galois Field Multiplication implementation.
35 *
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
38 * found at:
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 *
42 */
43
44 /*
45 * ====================================================================
46 * OpenSolaris OS modifications
47 *
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
50 *
51 * This OpenSolaris version has these major changes from the original source:
52 *
53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 * definition for lint.
56 *
57 * 2. Formatted code, added comments, and added #includes and #defines.
58 *
59 * 3. If bit CR0.TS is set, clear the TS bit after calling
60 * kpreempt_disable() and set it again before calling kpreempt_enable().
61 * If the TS bit is not set, save and restore %xmm registers at the beginning
62 * and end of function calls (%xmm* registers are not saved and restored
63 * during kernel thread preemption).
64 *
65 * 4. Removed code to perform hashing. This is already done with C macro
66 * GHASH in gcm.c. For better performance, this removed code should be
67 * reintegrated in the future to replace the C GHASH macro.
68 *
69 * 5. Added code to byte swap 16-byte input and output.
70 *
71 * 6. Folded in comments from the original C source with embedded assembly
72 * (SB_w_shift_xor.c)
73 *
74 * 7. Renamed function and reordered parameters to match OpenSolaris:
75 * Intel interface:
76 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 * unsigned char *d, int length)
78 * OpenSolaris OS interface:
79 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 * ====================================================================
81 */
82
83
84 #if defined(lint) || defined(__lint) /* lint */
85
86 #include <sys/types.h>
87
88 void
89 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
90 (void) x_in; (void) y; (void) res; /* lint-only stub: arguments are intentionally unused */
91 }
92
93 #elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */
94
95 #define _ASM
96 #include <sys/asm_linkage.h>
97
98 /*
99 * pshufb control mask: output byte i comes from input byte (15 - i), i.e. a 16-byte byte swap
100 */
101
102 // C equivalent: static uint8_t byte_swap16_mask[] = {
103 // 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 };
104 .section .rodata
105 .balign XMM_ALIGN
106 .Lbyte_swap16_mask:
107 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
108
109
110 /*
111 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
112 *
113 * Perform a carry-less multiplication (that is, use XOR instead of the
114 * multiply operator) on P1 and P2 and place the result in P3.
115 *
116 * Byte swap the input and the output.
117 *
118 * Note: x_in, y, and res all point to a block of 16-byte numbers
119 * (an array of two 64-bit integers).
120 *
121 * Note2: For kernel code, caller is responsible for ensuring
122 * kpreempt_disable() has been called. This is because %xmm registers are
123 * not saved/restored: the instructions in this routine neither touch the
124 * CR0.TS bit nor save %xmm registers on the stack, and they overwrite
125 * %xmm0 through %xmm10, so the caller must preserve any state it needs.
126 *
127 * Note3: Original Intel definition:
128 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
129 * unsigned char *d, int length)
130 *
131 * Note4: Register/parameter mapping:
132 * Intel:
133 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
134 * Parameter 2: %rdx (copied to %xmm1) s or y
135 * Parameter 3: %rdi (result) d or res
136 * OpenSolaris:
137 * Parameter 1: %rdi (copied to %xmm0) x_in
138 * Parameter 2: %rsi (copied to %xmm1) y
139 * Parameter 3: %rdx (result) res
140 */
141
142 ENTRY_NP(gcm_mul_pclmulqdq)
143 // Register use: %rdi = x_in, %rsi = y, %rdx = res; %rax and %xmm0-%xmm10
144 // are overwritten and not restored. Copy Parameters: load both 16-byte
145 // operands with unaligned loads.
146 movdqu (%rdi), %xmm0 // xmm0 = P1 (16 bytes at x_in)
147 movdqu (%rsi), %xmm1 // xmm1 = P2 (16 bytes at y)
148
149 //
150 // Byte swap both 16-byte inputs; %xmm10 keeps the mask for the output swap
151 //
152 lea .Lbyte_swap16_mask(%rip), %rax // rax = address of the swap mask (PC-relative)
153 movups (%rax), %xmm10 // xmm10 = byte-reversal mask for pshufb
154 pshufb %xmm10, %xmm0 // reverse the byte order of P1
155 pshufb %xmm10, %xmm1 // reverse the byte order of P2
156
157
158 // Carry-less multiply xmm0 (a1:a0) by xmm1 (b1:b0) as four 64x64-bit
159 // partial products; a0/b0 are the low quadwords, a1/b1 the high ones.
160 // The immediate selects which quadword of each operand is multiplied.
161 movdqu %xmm0, %xmm3
162 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
163
164 movdqu %xmm0, %xmm4
165 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
166
167 movdqu %xmm0, %xmm5
168 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
169 movdqu %xmm0, %xmm6
170 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
171
172 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 (+ is XOR)
173
174 movdqu %xmm4, %xmm5 // copy the middle term so both halves can be placed
175 psrldq $8, %xmm4 // xmm4 = middle term shifted right by 64 bits
176 pslldq $8, %xmm5 // xmm5 = middle term shifted left by 64 bits
177 pxor %xmm5, %xmm3 // fold the low half of the middle term into xmm3
178 pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
179 // of the carry-less multiplication of
180 // xmm0 by xmm1.
181
182 // Shift the 256-bit product <xmm6:xmm3> left by one bit position
183 // to compensate for the fact that the bits are reversed.
184 movdqu %xmm3, %xmm7 // copies used to recover the bits shifted out
185 movdqu %xmm6, %xmm8
186 pslld $1, %xmm3 // shift each 32-bit lane left by one
187 pslld $1, %xmm6
188 psrld $31, %xmm7 // keep only the top bit of each lane (the carry)
189 psrld $31, %xmm8
190 movdqu %xmm7, %xmm9 // xmm9 = carries of xmm3, for the carry into xmm6
191 pslldq $4, %xmm8 // move each carry up to the next 32-bit lane
192 pslldq $4, %xmm7
193 psrldq $12, %xmm9 // top-lane carry of xmm3 becomes lane 0 of xmm9
194 por %xmm7, %xmm3 // merge carries into the shifted low half
195 por %xmm8, %xmm6 // merge carries into the shifted high half
196 por %xmm9, %xmm6 // carry crossing from xmm3 into xmm6
197
198 //
199 // First phase of the reduction
200 //
201 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
202 // independently.
203 movdqu %xmm3, %xmm7
204 movdqu %xmm3, %xmm8
205 movdqu %xmm3, %xmm9
206 pslld $31, %xmm7 // packed left shift of each 32-bit lane, << 31
207 pslld $30, %xmm8 // packed left shift of each 32-bit lane, << 30
208 pslld $25, %xmm9 // packed left shift of each 32-bit lane, << 25
209 pxor %xmm8, %xmm7 // xor the shifted versions
210 pxor %xmm9, %xmm7
211 movdqu %xmm7, %xmm8 // keep a copy for the second phase
212 pslldq $12, %xmm7 // lowest lane of xmm7 moves to the top lane
213 psrldq $4, %xmm8 // upper three lanes of the copy move down one lane
214 pxor %xmm7, %xmm3 // first phase of the reduction complete
215
216 //
217 // Second phase of the reduction
218 //
219 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
220 // shift operations.
221 movdqu %xmm3, %xmm2
222 movdqu %xmm3, %xmm4
223 movdqu %xmm3, %xmm5
224 psrld $1, %xmm2 // packed right shift of each 32-bit lane, >> 1
225 psrld $2, %xmm4 // packed right shift of each 32-bit lane, >> 2
226 psrld $7, %xmm5 // packed right shift of each 32-bit lane, >> 7
227 pxor %xmm4, %xmm2 // xor the shifted versions
228 pxor %xmm5, %xmm2
229 pxor %xmm8, %xmm2 // add the term saved in xmm8 during the first phase
230 pxor %xmm2, %xmm3
231 pxor %xmm3, %xmm6 // the result is in xmm6
232
233 //
234 // Byte swap 16-byte result
235 //
236 pshufb %xmm10, %xmm6 // %xmm10 still holds the swap mask loaded above
237
238 //
239 // Store the result
240 //
241 movdqu %xmm6, (%rdx) // P3: unaligned 16-byte store to res
242
243
244 //
245 // Return
246 //
247 RET
248 SET_SIZE(gcm_mul_pclmulqdq)
249
250 #endif /* lint || __lint */
251
252 #ifdef __ELF__
253 .section .note.GNU-stack,"",%progbits
254 #endif
Cache object: 60abf18a625faa70a95948aa8ba7b1d5
|