1 /*
2 * Copyright (c) Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11 #include "../common/portability_macros.h"
12
13 /* Stack marking
14 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15 */
16 #if defined(__ELF__) && defined(__GNUC__)
17 .section .note.GNU-stack,"",%progbits
18 #endif
19
20 #if ZSTD_ENABLE_ASM_X86_64_BMI2
21
22 /* Calling convention:
23 *
24 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25 * %rbp isn't maintained (no frame pointer).
26 * %rsp contains the stack pointer that grows down.
27 * No red-zone is assumed, only addresses >= %rsp are used.
28 * All register contents are preserved.
29 *
30 * TODO: Support Windows calling convention.
31 */
32
33 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
34 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
35 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
36 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
37 .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
38 .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
39 .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
40 .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
41 .text
42
43 /* Sets up register mappings for clarity.
44 * op[], bits[], dtable & ip[0] each get their own register.
45 * ip[1,2,3] & olimit alias var[].
46 * %rax is a scratch register.
47 */
48
49 #define op0 rsi
50 #define op1 rbx
51 #define op2 rcx
52 #define op3 rdi
53
54 #define ip0 r8
55 #define ip1 r9
56 #define ip2 r10
57 #define ip3 r11
58
59 #define bits0 rbp
60 #define bits1 rdx
61 #define bits2 r12
62 #define bits3 r13
63 #define dtable r14
64 #define olimit r15
65
66 /* var[] aliases ip[1,2,3] & olimit
67 * ip[1,2,3] are saved every iteration.
68 * olimit is only used in compute_olimit.
69 */
70 #define var0 r15
71 #define var1 r9
72 #define var2 r10
73 #define var3 r11
74
75 /* 32-bit var registers */
76 #define vard0 r15d
77 #define vard1 r9d
78 #define vard2 r10d
79 #define vard3 r11d
80
81 /* Calls X(N) for each stream 0, 1, 2, 3. */
82 #define FOR_EACH_STREAM(X) \
83 X(0); \
84 X(1); \
85 X(2); \
86 X(3)
87
88 /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89 #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90 X(0, idx); \
91 X(1, idx); \
92 X(2, idx); \
93 X(3, idx)
94
95 /* Define both _HUF_* & HUF_* symbols because MacOS
96 * C symbols are prefixed with '_' & Linux symbols aren't.
97 */
98 _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
99 HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
100 /* Save all registers - even if they are callee saved for simplicity. */
101 push %rax
102 push %rbx
103 push %rcx
104 push %rdx
105 push %rbp
106 push %rsi
107 push %rdi
108 push %r8
109 push %r9
110 push %r10
111 push %r11
112 push %r12
113 push %r13
114 push %r14
115 push %r15
116
117 /* Read HUF_DecompressAsmArgs* args from %rax */
118 movq %rdi, %rax
119 movq 0(%rax), %ip0
120 movq 8(%rax), %ip1
121 movq 16(%rax), %ip2
122 movq 24(%rax), %ip3
123 movq 32(%rax), %op0
124 movq 40(%rax), %op1
125 movq 48(%rax), %op2
126 movq 56(%rax), %op3
127 movq 64(%rax), %bits0
128 movq 72(%rax), %bits1
129 movq 80(%rax), %bits2
130 movq 88(%rax), %bits3
131 movq 96(%rax), %dtable
132 push %rax /* argument */
133 push 104(%rax) /* ilimit */
134 push 112(%rax) /* oend */
135 push %olimit /* olimit space */
136
137 subq $24, %rsp
138
139 .L_4X1_compute_olimit:
140 /* Computes how many iterations we can do safely
141 * %r15, %rax may be clobbered
142 * rbx, rdx must be saved
143 * op3 & ip0 mustn't be clobbered
144 */
145 movq %rbx, 0(%rsp)
146 movq %rdx, 8(%rsp)
147
148 movq 32(%rsp), %rax /* rax = oend */
149 subq %op3, %rax /* rax = oend - op3 */
150
151 /* r15 = (oend - op3) / 5 */
152 movabsq $-3689348814741910323, %rdx
153 mulq %rdx
154 movq %rdx, %r15
155 shrq $2, %r15
156
157 movq %ip0, %rax /* rax = ip0 */
158 movq 40(%rsp), %rdx /* rdx = ilimit */
159 subq %rdx, %rax /* rax = ip0 - ilimit */
160 movq %rax, %rbx /* rbx = ip0 - ilimit */
161
162 /* rdx = (ip0 - ilimit) / 7 */
163 movabsq $2635249153387078803, %rdx
164 mulq %rdx
165 subq %rdx, %rbx
166 shrq %rbx
167 addq %rbx, %rdx
168 shrq $2, %rdx
169
170 /* r15 = min(%rdx, %r15) */
171 cmpq %rdx, %r15
172 cmova %rdx, %r15
173
174 /* r15 = r15 * 5 */
175 leaq (%r15, %r15, 4), %r15
176
177 /* olimit = op3 + r15 */
178 addq %op3, %olimit
179
180 movq 8(%rsp), %rdx
181 movq 0(%rsp), %rbx
182
183 /* If (op3 + 20 > olimit) */
184 movq %op3, %rax /* rax = op3 */
185 addq $20, %rax /* rax = op3 + 20 */
186 cmpq %rax, %olimit /* op3 + 20 > olimit */
187 jb .L_4X1_exit
188
189 /* If (ip1 < ip0) go to exit */
190 cmpq %ip0, %ip1
191 jb .L_4X1_exit
192
193 /* If (ip2 < ip1) go to exit */
194 cmpq %ip1, %ip2
195 jb .L_4X1_exit
196
197 /* If (ip3 < ip2) go to exit */
198 cmpq %ip2, %ip3
199 jb .L_4X1_exit
200
201 /* Reads top 11 bits from bits[n]
202 * Loads dt[bits[n]] into var[n]
203 */
204 #define GET_NEXT_DELT(n) \
205 movq $53, %var##n; \
206 shrxq %var##n, %bits##n, %var##n; \
207 movzwl (%dtable,%var##n,2),%vard##n
208
209 /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
210 * Moves var[n] to %rax
211 * bits[n] <<= var[n] & 63
212 * op[n][idx] = %rax >> 8
213 * %ah is a way to access bits [8, 16) of %rax
214 */
215 #define DECODE_FROM_DELT(n, idx) \
216 movq %var##n, %rax; \
217 shlxq %var##n, %bits##n, %bits##n; \
218 movb %ah, idx(%op##n)
219
220 /* Assumes GET_NEXT_DELT has been called.
221 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
222 */
223 #define DECODE_AND_GET_NEXT(n, idx) \
224 DECODE_FROM_DELT(n, idx); \
225 GET_NEXT_DELT(n) \
226
227 /* // ctz & nbBytes is stored in bits[n]
228 * // nbBits is stored in %rax
229 * ctz = CTZ[bits[n]]
230 * nbBits = ctz & 7
231 * nbBytes = ctz >> 3
232 * op[n] += 5
233 * ip[n] -= nbBytes
234 * // Note: x86-64 is little-endian ==> no bswap
235 * bits[n] = MEM_readST(ip[n]) | 1
236 * bits[n] <<= nbBits
237 */
238 #define RELOAD_BITS(n) \
239 bsfq %bits##n, %bits##n; \
240 movq %bits##n, %rax; \
241 andq $7, %rax; \
242 shrq $3, %bits##n; \
243 leaq 5(%op##n), %op##n; \
244 subq %bits##n, %ip##n; \
245 movq (%ip##n), %bits##n; \
246 orq $1, %bits##n; \
247 shlx %rax, %bits##n, %bits##n
248
249 /* Store clobbered variables on the stack */
250 movq %olimit, 24(%rsp)
251 movq %ip1, 0(%rsp)
252 movq %ip2, 8(%rsp)
253 movq %ip3, 16(%rsp)
254
255 /* Call GET_NEXT_DELT for each stream */
256 FOR_EACH_STREAM(GET_NEXT_DELT)
257
258 .p2align 6
259
260 .L_4X1_loop_body:
261 /* Decode 5 symbols in each of the 4 streams (20 total)
262 * Must have called GET_NEXT_DELT for each stream
263 */
264 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
265 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
266 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
267 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
268 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
269
270 /* Load ip[1,2,3] from stack (var[] aliases them)
271 * ip[] is needed for RELOAD_BITS
272 * Each will be stored back to the stack after RELOAD
273 */
274 movq 0(%rsp), %ip1
275 movq 8(%rsp), %ip2
276 movq 16(%rsp), %ip3
277
278 /* Reload each stream & fetch the next table entry
279 * to prepare for the next iteration
280 */
281 RELOAD_BITS(0)
282 GET_NEXT_DELT(0)
283
284 RELOAD_BITS(1)
285 movq %ip1, 0(%rsp)
286 GET_NEXT_DELT(1)
287
288 RELOAD_BITS(2)
289 movq %ip2, 8(%rsp)
290 GET_NEXT_DELT(2)
291
292 RELOAD_BITS(3)
293 movq %ip3, 16(%rsp)
294 GET_NEXT_DELT(3)
295
296 /* If op3 < olimit: continue the loop */
297 cmp %op3, 24(%rsp)
298 ja .L_4X1_loop_body
299
300 /* Reload ip[1,2,3] from stack */
301 movq 0(%rsp), %ip1
302 movq 8(%rsp), %ip2
303 movq 16(%rsp), %ip3
304
305 /* Re-compute olimit */
306 jmp .L_4X1_compute_olimit
307
308 #undef GET_NEXT_DELT
309 #undef DECODE_FROM_DELT
310 #undef DECODE
311 #undef RELOAD_BITS
312 .L_4X1_exit:
313 addq $24, %rsp
314
315 /* Restore stack (oend & olimit) */
316 pop %rax /* olimit */
317 pop %rax /* oend */
318 pop %rax /* ilimit */
319 pop %rax /* arg */
320
321 /* Save ip / op / bits */
322 movq %ip0, 0(%rax)
323 movq %ip1, 8(%rax)
324 movq %ip2, 16(%rax)
325 movq %ip3, 24(%rax)
326 movq %op0, 32(%rax)
327 movq %op1, 40(%rax)
328 movq %op2, 48(%rax)
329 movq %op3, 56(%rax)
330 movq %bits0, 64(%rax)
331 movq %bits1, 72(%rax)
332 movq %bits2, 80(%rax)
333 movq %bits3, 88(%rax)
334
335 /* Restore registers */
336 pop %r15
337 pop %r14
338 pop %r13
339 pop %r12
340 pop %r11
341 pop %r10
342 pop %r9
343 pop %r8
344 pop %rdi
345 pop %rsi
346 pop %rbp
347 pop %rdx
348 pop %rcx
349 pop %rbx
350 pop %rax
351 ret
352
353 _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
354 HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
355 /* Save all registers - even if they are callee saved for simplicity. */
356 push %rax
357 push %rbx
358 push %rcx
359 push %rdx
360 push %rbp
361 push %rsi
362 push %rdi
363 push %r8
364 push %r9
365 push %r10
366 push %r11
367 push %r12
368 push %r13
369 push %r14
370 push %r15
371
372 movq %rdi, %rax
373 movq 0(%rax), %ip0
374 movq 8(%rax), %ip1
375 movq 16(%rax), %ip2
376 movq 24(%rax), %ip3
377 movq 32(%rax), %op0
378 movq 40(%rax), %op1
379 movq 48(%rax), %op2
380 movq 56(%rax), %op3
381 movq 64(%rax), %bits0
382 movq 72(%rax), %bits1
383 movq 80(%rax), %bits2
384 movq 88(%rax), %bits3
385 movq 96(%rax), %dtable
386 push %rax /* argument */
387 push %rax /* olimit */
388 push 104(%rax) /* ilimit */
389
390 movq 112(%rax), %rax
391 push %rax /* oend3 */
392
393 movq %op3, %rax
394 push %rax /* oend2 */
395
396 movq %op2, %rax
397 push %rax /* oend1 */
398
399 movq %op1, %rax
400 push %rax /* oend0 */
401
402 /* Scratch space */
403 subq $8, %rsp
404
405 .L_4X2_compute_olimit:
406 /* Computes how many iterations we can do safely
407 * %r15, %rax may be clobbered
408 * rdx must be saved
409 * op[1,2,3,4] & ip0 mustn't be clobbered
410 */
411 movq %rdx, 0(%rsp)
412
413 /* We can consume up to 7 input bytes each iteration. */
414 movq %ip0, %rax /* rax = ip0 */
415 movq 40(%rsp), %rdx /* rdx = ilimit */
416 subq %rdx, %rax /* rax = ip0 - ilimit */
417 movq %rax, %r15 /* r15 = ip0 - ilimit */
418
419 /* rdx = rax / 7 */
420 movabsq $2635249153387078803, %rdx
421 mulq %rdx
422 subq %rdx, %r15
423 shrq %r15
424 addq %r15, %rdx
425 shrq $2, %rdx
426
427 /* r15 = (ip0 - ilimit) / 7 */
428 movq %rdx, %r15
429
430 movabsq $-3689348814741910323, %rdx
431 movq 8(%rsp), %rax /* rax = oend0 */
432 subq %op0, %rax /* rax = oend0 - op0 */
433 mulq %rdx
434 shrq $3, %rdx /* rdx = rax / 10 */
435
436 /* r15 = min(%rdx, %r15) */
437 cmpq %rdx, %r15
438 cmova %rdx, %r15
439
440 movabsq $-3689348814741910323, %rdx
441 movq 16(%rsp), %rax /* rax = oend1 */
442 subq %op1, %rax /* rax = oend1 - op1 */
443 mulq %rdx
444 shrq $3, %rdx /* rdx = rax / 10 */
445
446 /* r15 = min(%rdx, %r15) */
447 cmpq %rdx, %r15
448 cmova %rdx, %r15
449
450 movabsq $-3689348814741910323, %rdx
451 movq 24(%rsp), %rax /* rax = oend2 */
452 subq %op2, %rax /* rax = oend2 - op2 */
453 mulq %rdx
454 shrq $3, %rdx /* rdx = rax / 10 */
455
456 /* r15 = min(%rdx, %r15) */
457 cmpq %rdx, %r15
458 cmova %rdx, %r15
459
460 movabsq $-3689348814741910323, %rdx
461 movq 32(%rsp), %rax /* rax = oend3 */
462 subq %op3, %rax /* rax = oend3 - op3 */
463 mulq %rdx
464 shrq $3, %rdx /* rdx = rax / 10 */
465
466 /* r15 = min(%rdx, %r15) */
467 cmpq %rdx, %r15
468 cmova %rdx, %r15
469
470 /* olimit = op3 + 5 * r15 */
471 movq %r15, %rax
472 leaq (%op3, %rax, 4), %olimit
473 addq %rax, %olimit
474
475 movq 0(%rsp), %rdx
476
477 /* If (op3 + 10 > olimit) */
478 movq %op3, %rax /* rax = op3 */
479 addq $10, %rax /* rax = op3 + 10 */
480 cmpq %rax, %olimit /* op3 + 10 > olimit */
481 jb .L_4X2_exit
482
483 /* If (ip1 < ip0) go to exit */
484 cmpq %ip0, %ip1
485 jb .L_4X2_exit
486
487 /* If (ip2 < ip1) go to exit */
488 cmpq %ip1, %ip2
489 jb .L_4X2_exit
490
491 /* If (ip3 < ip2) go to exit */
492 cmpq %ip2, %ip3
493 jb .L_4X2_exit
494
495 #define DECODE(n, idx) \
496 movq %bits##n, %rax; \
497 shrq $53, %rax; \
498 movzwl 0(%dtable,%rax,4),%r8d; \
499 movzbl 2(%dtable,%rax,4),%r15d; \
500 movzbl 3(%dtable,%rax,4),%eax; \
501 movw %r8w, (%op##n); \
502 shlxq %r15, %bits##n, %bits##n; \
503 addq %rax, %op##n
504
505 #define RELOAD_BITS(n) \
506 bsfq %bits##n, %bits##n; \
507 movq %bits##n, %rax; \
508 shrq $3, %bits##n; \
509 andq $7, %rax; \
510 subq %bits##n, %ip##n; \
511 movq (%ip##n), %bits##n; \
512 orq $1, %bits##n; \
513 shlxq %rax, %bits##n, %bits##n
514
515
516 movq %olimit, 48(%rsp)
517
518 .p2align 6
519
520 .L_4X2_loop_body:
521 /* We clobber r8, so store it on the stack */
522 movq %r8, 0(%rsp)
523
524 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
525 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
526 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
527 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
528 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
529 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
530
531 /* Reload r8 */
532 movq 0(%rsp), %r8
533
534 FOR_EACH_STREAM(RELOAD_BITS)
535
536 cmp %op3, 48(%rsp)
537 ja .L_4X2_loop_body
538 jmp .L_4X2_compute_olimit
539
540 #undef DECODE
541 #undef RELOAD_BITS
542 .L_4X2_exit:
543 addq $8, %rsp
544 /* Restore stack (oend & olimit) */
545 pop %rax /* oend0 */
546 pop %rax /* oend1 */
547 pop %rax /* oend2 */
548 pop %rax /* oend3 */
549 pop %rax /* ilimit */
550 pop %rax /* olimit */
551 pop %rax /* arg */
552
553 /* Save ip / op / bits */
554 movq %ip0, 0(%rax)
555 movq %ip1, 8(%rax)
556 movq %ip2, 16(%rax)
557 movq %ip3, 24(%rax)
558 movq %op0, 32(%rax)
559 movq %op1, 40(%rax)
560 movq %op2, 48(%rax)
561 movq %op3, 56(%rax)
562 movq %bits0, 64(%rax)
563 movq %bits1, 72(%rax)
564 movq %bits2, 80(%rax)
565 movq %bits3, 88(%rax)
566
567 /* Restore registers */
568 pop %r15
569 pop %r14
570 pop %r13
571 pop %r12
572 pop %r11
573 pop %r10
574 pop %r9
575 pop %r8
576 pop %rdi
577 pop %rsi
578 pop %rbp
579 pop %rdx
580 pop %rcx
581 pop %rbx
582 pop %rax
583 ret
584
585 #endif
Cache object: 04ea46d1163b839e725a4b7dbed18b27
|