The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/zstd/lib/decompress/huf_decompress_amd64.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) Facebook, Inc.
    3  * All rights reserved.
    4  *
    5  * This source code is licensed under both the BSD-style license (found in the
    6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    7  * in the COPYING file in the root directory of this source tree).
    8  * You may select, at your option, one of the above-listed licenses.
    9  */
   10 
   11 #include "../common/portability_macros.h"
   12 
   13 /* Stack marking
   14  * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
   15  */
   16 #if defined(__ELF__) && defined(__GNUC__)
   17 .section .note.GNU-stack,"",%progbits
   18 #endif
   19 
   20 #if ZSTD_ENABLE_ASM_X86_64_BMI2
   21 
   22 /* Calling convention:
   23  *
   24  * %rdi contains the first argument: HUF_DecompressAsmArgs*.
   25  * %rbp isn't maintained (no frame pointer).
   26  * %rsp contains the stack pointer that grows down.
   27  *      No red-zone is assumed, only addresses >= %rsp are used.
   28  * All register contents are preserved.
   29  *
   30  * TODO: Support Windows calling convention.
   31  */
   32 
   33 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
   34 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
   35 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
   36 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
   37 .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
   38 .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
   39 .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
   40 .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
   41 .text
   42 
   43 /* Sets up register mappings for clarity.
   44  * op[], bits[], dtable & ip[0] each get their own register.
   45  * ip[1,2,3] & olimit alias var[].
   46  * %rax is a scratch register.
   47  */
   48 
   49 #define op0    rsi
   50 #define op1    rbx
   51 #define op2    rcx
   52 #define op3    rdi
   53 
   54 #define ip0    r8
   55 #define ip1    r9
   56 #define ip2    r10
   57 #define ip3    r11
   58 
   59 #define bits0  rbp
   60 #define bits1  rdx
   61 #define bits2  r12
   62 #define bits3  r13
   63 #define dtable r14
   64 #define olimit r15
   65 
   66 /* var[] aliases ip[1,2,3] & olimit
   67  * ip[1,2,3] are saved every iteration.
   68  * olimit is only used in compute_olimit.
   69  */
   70 #define var0   r15
   71 #define var1   r9
   72 #define var2   r10
   73 #define var3   r11
   74 
   75 /* 32-bit var registers */
   76 #define vard0  r15d
   77 #define vard1  r9d
   78 #define vard2  r10d
   79 #define vard3  r11d
   80 
   81 /* Calls X(N) for each stream 0, 1, 2, 3. */
   82 #define FOR_EACH_STREAM(X) \
   83     X(0);                  \
   84     X(1);                  \
   85     X(2);                  \
   86     X(3)
   87 
   88 /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
   89 #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
   90     X(0, idx);                             \
   91     X(1, idx);                             \
   92     X(2, idx);                             \
   93     X(3, idx)
   94 
   95 /* Define both _HUF_* & HUF_* symbols because MacOS
   96  * C symbols are prefixed with '_' & Linux symbols aren't.
   97  */
   98 _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
   99 HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
  100     /* Save all registers - even if they are callee saved for simplicity. */
  101     push %rax
  102     push %rbx
  103     push %rcx
  104     push %rdx
  105     push %rbp
  106     push %rsi
  107     push %rdi
  108     push %r8
  109     push %r9
  110     push %r10
  111     push %r11
  112     push %r12
  113     push %r13
  114     push %r14
  115     push %r15
  116 
  117     /* Read HUF_DecompressAsmArgs* args from %rax */
  118     movq %rdi, %rax
  119     movq  0(%rax), %ip0
  120     movq  8(%rax), %ip1
  121     movq 16(%rax), %ip2
  122     movq 24(%rax), %ip3
  123     movq 32(%rax), %op0
  124     movq 40(%rax), %op1
  125     movq 48(%rax), %op2
  126     movq 56(%rax), %op3
  127     movq 64(%rax), %bits0
  128     movq 72(%rax), %bits1
  129     movq 80(%rax), %bits2
  130     movq 88(%rax), %bits3
  131     movq 96(%rax), %dtable
  132     push %rax      /* argument */
  133     push 104(%rax) /* ilimit */
  134     push 112(%rax) /* oend */
  135     push %olimit   /* olimit space */
  136 
  137     subq $24, %rsp
  138 
  139 .L_4X1_compute_olimit:
  140     /* Computes how many iterations we can do safely
  141      * %r15, %rax may be clobbered
  142      * rbx, rdx must be saved
  143      * op3 & ip0 mustn't be clobbered
  144      */
  145     movq %rbx, 0(%rsp)
  146     movq %rdx, 8(%rsp)
  147 
  148     movq 32(%rsp), %rax /* rax = oend */
  149     subq %op3,    %rax  /* rax = oend - op3 */
  150 
  151     /* r15 = (oend - op3) / 5 */
  152     movabsq $-3689348814741910323, %rdx
  153     mulq %rdx
  154     movq %rdx, %r15
  155     shrq $2, %r15
  156 
  157     movq %ip0,     %rax /* rax = ip0 */
  158     movq 40(%rsp), %rdx /* rdx = ilimit */
  159     subq %rdx,     %rax /* rax = ip0 - ilimit */
  160     movq %rax,     %rbx /* rbx = ip0 - ilimit */
  161 
  162     /* rdx = (ip0 - ilimit) / 7 */
  163     movabsq $2635249153387078803, %rdx
  164     mulq %rdx
  165     subq %rdx, %rbx
  166     shrq %rbx
  167     addq %rbx, %rdx
  168     shrq $2, %rdx
  169 
  170     /* r15 = min(%rdx, %r15) */
  171     cmpq %rdx, %r15
  172     cmova %rdx, %r15
  173 
  174     /* r15 = r15 * 5 */
  175     leaq (%r15, %r15, 4), %r15
  176 
  177     /* olimit = op3 + r15 */
  178     addq %op3, %olimit
  179 
  180     movq 8(%rsp), %rdx
  181     movq 0(%rsp), %rbx
  182 
  183     /* If (op3 + 20 > olimit) */
  184     movq %op3, %rax    /* rax = op3 */
  185     addq $20,  %rax    /* rax = op3 + 20 */
  186     cmpq %rax, %olimit /* op3 + 20 > olimit */
  187     jb .L_4X1_exit
  188 
  189     /* If (ip1 < ip0) go to exit */
  190     cmpq %ip0, %ip1
  191     jb .L_4X1_exit
  192 
  193     /* If (ip2 < ip1) go to exit */
  194     cmpq %ip1, %ip2
  195     jb .L_4X1_exit
  196 
  197     /* If (ip3 < ip2) go to exit */
  198     cmpq %ip2, %ip3
  199     jb .L_4X1_exit
  200 
  201 /* Reads top 11 bits from bits[n]
  202  * Loads dt[bits[n]] into var[n]
  203  */
  204 #define GET_NEXT_DELT(n)                \
  205     movq $53, %var##n;                  \
  206     shrxq %var##n, %bits##n, %var##n;   \
  207     movzwl (%dtable,%var##n,2),%vard##n
  208 
  209 /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
  210  * Moves var[n] to %rax
  211  * bits[n] <<= var[n] & 63
  212  * op[n][idx] = %rax >> 8
  213  * %ah is a way to access bits [8, 16) of %rax
  214  */
  215 #define DECODE_FROM_DELT(n, idx)       \
  216     movq %var##n, %rax;                \
  217     shlxq %var##n, %bits##n, %bits##n; \
  218     movb %ah, idx(%op##n)
  219 
  220 /* Assumes GET_NEXT_DELT has been called.
  221  * Calls DECODE_FROM_DELT then GET_NEXT_DELT
  222  */
  223 #define DECODE_AND_GET_NEXT(n, idx) \
  224     DECODE_FROM_DELT(n, idx);       \
  225     GET_NEXT_DELT(n)                \
  226 
  227 /* // ctz & nbBytes is stored in bits[n]
  228  * // nbBits is stored in %rax
  229  * ctz  = CTZ[bits[n]]
  230  * nbBits  = ctz & 7
  231  * nbBytes = ctz >> 3
  232  * op[n]  += 5
  233  * ip[n]  -= nbBytes
  234  * // Note: x86-64 is little-endian ==> no bswap
  235  * bits[n] = MEM_readST(ip[n]) | 1
  236  * bits[n] <<= nbBits
  237  */
  238 #define RELOAD_BITS(n)             \
  239     bsfq %bits##n, %bits##n;       \
  240     movq %bits##n, %rax;           \
  241     andq $7, %rax;                 \
  242     shrq $3, %bits##n;             \
  243     leaq 5(%op##n), %op##n;        \
  244     subq %bits##n, %ip##n;         \
  245     movq (%ip##n), %bits##n;       \
  246     orq $1, %bits##n;              \
  247     shlx %rax, %bits##n, %bits##n
  248 
  249     /* Store clobbered variables on the stack */
  250     movq %olimit, 24(%rsp)
  251     movq %ip1, 0(%rsp)
  252     movq %ip2, 8(%rsp)
  253     movq %ip3, 16(%rsp)
  254 
  255     /* Call GET_NEXT_DELT for each stream */
  256     FOR_EACH_STREAM(GET_NEXT_DELT)
  257 
  258     .p2align 6
  259 
  260 .L_4X1_loop_body:
  261     /* Decode 5 symbols in each of the 4 streams (20 total)
  262      * Must have called GET_NEXT_DELT for each stream
  263      */
  264     FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
  265     FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
  266     FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
  267     FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
  268     FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
  269 
  270     /* Load ip[1,2,3] from stack (var[] aliases them)
  271      * ip[] is needed for RELOAD_BITS
  272      * Each will be stored back to the stack after RELOAD
  273      */
  274     movq 0(%rsp), %ip1
  275     movq 8(%rsp), %ip2
  276     movq 16(%rsp), %ip3
  277 
  278     /* Reload each stream & fetch the next table entry
  279      * to prepare for the next iteration
  280      */
  281     RELOAD_BITS(0)
  282     GET_NEXT_DELT(0)
  283 
  284     RELOAD_BITS(1)
  285     movq %ip1, 0(%rsp)
  286     GET_NEXT_DELT(1)
  287 
  288     RELOAD_BITS(2)
  289     movq %ip2, 8(%rsp)
  290     GET_NEXT_DELT(2)
  291 
  292     RELOAD_BITS(3)
  293     movq %ip3, 16(%rsp)
  294     GET_NEXT_DELT(3)
  295 
  296     /* If op3 < olimit: continue the loop */
  297     cmp %op3, 24(%rsp)
  298     ja .L_4X1_loop_body
  299 
  300     /* Reload ip[1,2,3] from stack */
  301     movq 0(%rsp), %ip1
  302     movq 8(%rsp), %ip2
  303     movq 16(%rsp), %ip3
  304 
  305     /* Re-compute olimit */
  306     jmp .L_4X1_compute_olimit
  307 
  308 #undef GET_NEXT_DELT
  309 #undef DECODE_FROM_DELT
  310 #undef DECODE
  311 #undef RELOAD_BITS
  312 .L_4X1_exit:
  313     addq $24, %rsp
  314 
  315     /* Restore stack (oend & olimit) */
  316     pop %rax /* olimit */
  317     pop %rax /* oend */
  318     pop %rax /* ilimit */
  319     pop %rax /* arg */
  320 
  321     /* Save ip / op / bits */
  322     movq %ip0,  0(%rax)
  323     movq %ip1,  8(%rax)
  324     movq %ip2, 16(%rax)
  325     movq %ip3, 24(%rax)
  326     movq %op0, 32(%rax)
  327     movq %op1, 40(%rax)
  328     movq %op2, 48(%rax)
  329     movq %op3, 56(%rax)
  330     movq %bits0, 64(%rax)
  331     movq %bits1, 72(%rax)
  332     movq %bits2, 80(%rax)
  333     movq %bits3, 88(%rax)
  334 
  335     /* Restore registers */
  336     pop %r15
  337     pop %r14
  338     pop %r13
  339     pop %r12
  340     pop %r11
  341     pop %r10
  342     pop %r9
  343     pop %r8
  344     pop %rdi
  345     pop %rsi
  346     pop %rbp
  347     pop %rdx
  348     pop %rcx
  349     pop %rbx
  350     pop %rax
  351     ret
  352 
  353 _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
  354 HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
  355     /* Save all registers - even if they are callee saved for simplicity. */
  356     push %rax
  357     push %rbx
  358     push %rcx
  359     push %rdx
  360     push %rbp
  361     push %rsi
  362     push %rdi
  363     push %r8
  364     push %r9
  365     push %r10
  366     push %r11
  367     push %r12
  368     push %r13
  369     push %r14
  370     push %r15
  371 
  372     movq %rdi, %rax
  373     movq  0(%rax), %ip0
  374     movq  8(%rax), %ip1
  375     movq 16(%rax), %ip2
  376     movq 24(%rax), %ip3
  377     movq 32(%rax), %op0
  378     movq 40(%rax), %op1
  379     movq 48(%rax), %op2
  380     movq 56(%rax), %op3
  381     movq 64(%rax), %bits0
  382     movq 72(%rax), %bits1
  383     movq 80(%rax), %bits2
  384     movq 88(%rax), %bits3
  385     movq 96(%rax), %dtable
  386     push %rax      /* argument */
  387     push %rax      /* olimit */
  388     push 104(%rax) /* ilimit */
  389 
  390     movq 112(%rax), %rax
  391     push %rax /* oend3 */
  392 
  393     movq %op3, %rax
  394     push %rax /* oend2 */
  395 
  396     movq %op2, %rax
  397     push %rax /* oend1 */
  398 
  399     movq %op1, %rax
  400     push %rax /* oend0 */
  401 
  402     /* Scratch space */
  403     subq $8, %rsp
  404 
  405 .L_4X2_compute_olimit:
  406     /* Computes how many iterations we can do safely
  407      * %r15, %rax may be clobbered
  408      * rdx must be saved
  409      * op[1,2,3,4] & ip0 mustn't be clobbered
  410      */
  411     movq %rdx, 0(%rsp)
  412 
  413     /* We can consume up to 7 input bytes each iteration. */
  414     movq %ip0,     %rax  /* rax = ip0 */
  415     movq 40(%rsp), %rdx  /* rdx = ilimit */
  416     subq %rdx,     %rax  /* rax = ip0 - ilimit */
  417     movq %rax,    %r15   /* r15 = ip0 - ilimit */
  418 
  419     /* rdx = rax / 7 */
  420     movabsq $2635249153387078803, %rdx
  421     mulq %rdx
  422     subq %rdx, %r15
  423     shrq %r15
  424     addq %r15, %rdx
  425     shrq $2, %rdx
  426 
  427     /* r15 = (ip0 - ilimit) / 7 */
  428     movq %rdx, %r15
  429 
  430     movabsq $-3689348814741910323, %rdx
  431     movq 8(%rsp), %rax /* rax = oend0 */
  432     subq %op0,    %rax /* rax = oend0 - op0 */
  433     mulq %rdx
  434     shrq $3,      %rdx /* rdx = rax / 10 */
  435 
  436     /* r15 = min(%rdx, %r15) */
  437     cmpq  %rdx, %r15
  438     cmova %rdx, %r15
  439 
  440     movabsq $-3689348814741910323, %rdx
  441     movq 16(%rsp), %rax /* rax = oend1 */
  442     subq %op1,     %rax /* rax = oend1 - op1 */
  443     mulq %rdx
  444     shrq $3,       %rdx /* rdx = rax / 10 */
  445 
  446     /* r15 = min(%rdx, %r15) */
  447     cmpq  %rdx, %r15
  448     cmova %rdx, %r15
  449 
  450     movabsq $-3689348814741910323, %rdx
  451     movq 24(%rsp), %rax /* rax = oend2 */
  452     subq %op2,     %rax /* rax = oend2 - op2 */
  453     mulq %rdx
  454     shrq $3,       %rdx /* rdx = rax / 10 */
  455 
  456     /* r15 = min(%rdx, %r15) */
  457     cmpq  %rdx, %r15
  458     cmova %rdx, %r15
  459 
  460     movabsq $-3689348814741910323, %rdx
  461     movq 32(%rsp), %rax /* rax = oend3 */
  462     subq %op3,     %rax /* rax = oend3 - op3 */
  463     mulq %rdx
  464     shrq $3,       %rdx /* rdx = rax / 10 */
  465 
  466     /* r15 = min(%rdx, %r15) */
  467     cmpq  %rdx, %r15
  468     cmova %rdx, %r15
  469 
  470     /* olimit = op3 + 5 * r15 */
  471     movq %r15, %rax
  472     leaq (%op3, %rax, 4), %olimit
  473     addq %rax, %olimit
  474 
  475     movq 0(%rsp), %rdx
  476 
  477     /* If (op3 + 10 > olimit) */
  478     movq %op3, %rax    /* rax = op3 */
  479     addq $10,  %rax    /* rax = op3 + 10 */
  480     cmpq %rax, %olimit /* op3 + 10 > olimit */
  481     jb .L_4X2_exit
  482 
  483     /* If (ip1 < ip0) go to exit */
  484     cmpq %ip0, %ip1
  485     jb .L_4X2_exit
  486 
  487     /* If (ip2 < ip1) go to exit */
  488     cmpq %ip1, %ip2
  489     jb .L_4X2_exit
  490 
  491     /* If (ip3 < ip2) go to exit */
  492     cmpq %ip2, %ip3
  493     jb .L_4X2_exit
  494 
  495 #define DECODE(n, idx)              \
  496     movq %bits##n, %rax;            \
  497     shrq $53, %rax;                 \
  498     movzwl 0(%dtable,%rax,4),%r8d;  \
  499     movzbl 2(%dtable,%rax,4),%r15d; \
  500     movzbl 3(%dtable,%rax,4),%eax;  \
  501     movw %r8w, (%op##n);            \
  502     shlxq %r15, %bits##n, %bits##n; \
  503     addq %rax, %op##n
  504 
  505 #define RELOAD_BITS(n)              \
  506     bsfq %bits##n, %bits##n;        \
  507     movq %bits##n, %rax;            \
  508     shrq $3, %bits##n;              \
  509     andq $7, %rax;                  \
  510     subq %bits##n, %ip##n;          \
  511     movq (%ip##n), %bits##n;        \
  512     orq $1, %bits##n;               \
  513     shlxq %rax, %bits##n, %bits##n
  514 
  515 
  516     movq %olimit, 48(%rsp)
  517 
  518     .p2align 6
  519 
  520 .L_4X2_loop_body:
  521     /* We clobber r8, so store it on the stack */
  522     movq %r8, 0(%rsp)
  523 
  524     /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
  525     FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
  526     FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
  527     FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
  528     FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
  529     FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
  530 
  531     /* Reload r8 */
  532     movq 0(%rsp), %r8
  533 
  534     FOR_EACH_STREAM(RELOAD_BITS)
  535 
  536     cmp %op3, 48(%rsp)
  537     ja .L_4X2_loop_body
  538     jmp .L_4X2_compute_olimit
  539 
  540 #undef DECODE
  541 #undef RELOAD_BITS
  542 .L_4X2_exit:
  543     addq $8, %rsp
  544     /* Restore stack (oend & olimit) */
  545     pop %rax /* oend0 */
  546     pop %rax /* oend1 */
  547     pop %rax /* oend2 */
  548     pop %rax /* oend3 */
  549     pop %rax /* ilimit */
  550     pop %rax /* olimit */
  551     pop %rax /* arg */
  552 
  553     /* Save ip / op / bits */
  554     movq %ip0,  0(%rax)
  555     movq %ip1,  8(%rax)
  556     movq %ip2, 16(%rax)
  557     movq %ip3, 24(%rax)
  558     movq %op0, 32(%rax)
  559     movq %op1, 40(%rax)
  560     movq %op2, 48(%rax)
  561     movq %op3, 56(%rax)
  562     movq %bits0, 64(%rax)
  563     movq %bits1, 72(%rax)
  564     movq %bits2, 80(%rax)
  565     movq %bits3, 88(%rax)
  566 
  567     /* Restore registers */
  568     pop %r15
  569     pop %r14
  570     pop %r13
  571     pop %r12
  572     pop %r11
  573     pop %r10
  574     pop %r9
  575     pop %r8
  576     pop %rdi
  577     pop %rsi
  578     pop %rbp
  579     pop %rdx
  580     pop %rcx
  581     pop %rbx
  582     pop %rax
  583     ret
  584 
  585 #endif

Cache object: 04ea46d1163b839e725a4b7dbed18b27


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.