The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2009 Intel Corporation
   24  * All Rights Reserved.
   25  */
   26 /*
   27  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   28  * Use is subject to license terms.
   29  */
   30 
   31 /*
   32  * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
   33  * instructions.  This file contains an accelerated
   34  * Galois Field Multiplication implementation.
   35  *
   36  * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
   37  * carry-less multiplication. More information about PCLMULQDQ can be
   38  * found at:
   39  * http://software.intel.com/en-us/articles/
   40  * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
   41  *
   42  */
   43 
   44 /*
   45  * ====================================================================
   46  * OpenSolaris OS modifications
   47  *
   48  * This source originates as file galois_hash_asm.c from
   49  * Intel Corporation dated September 21, 2009.
   50  *
   51  * This OpenSolaris version has these major changes from the original source:
   52  *
   53  * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
   54  * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
   55  * definition for lint.
   56  *
   57  * 2. Formatted code, added comments, and added #includes and #defines.
   58  *
   59  * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
   60  * calling kpreempt_disable() and kpreempt_enable().
   61  * If the TS bit is not set, save and restore %xmm registers at the beginning
   62  * and end of function calls (%xmm* registers are not saved and restored
   63  * during kernel thread preemption).
   64  *
   65  * 4. Removed code to perform hashing.  This is already done with C macro
   66  * GHASH in gcm.c.  For better performance, this removed code should be
   67  * reintegrated in the future to replace the C GHASH macro.
   68  *
   69  * 5. Added code to byte swap 16-byte input and output.
   70  *
   71  * 6. Folded in comments from the original C source with embedded assembly
   72  * (SB_w_shift_xor.c)
   73  *
   74  * 7. Renamed function and reordered parameters to match OpenSolaris:
   75  * Intel interface:
   76  *      void galois_hash_asm(unsigned char *hk, unsigned char *s,
   77  *              unsigned char *d, int length)
   78  * OpenSolaris OS interface:
   79  *      void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
   80  * ====================================================================
   81  */
   82 
   83 
   84 #if defined(lint) || defined(__lint)    /* lint */
   85 
   86 #include <sys/types.h>
   87 
   88 void    /* lint-only stub: an empty C definition standing in for the assembly routine */
   89 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
   90         (void) x_in, (void) y, (void) res;      /* reference each parameter; no other effect */
   91 }
   92 
   93 #elif defined(HAVE_PCLMULQDQ)   /* guard by instruction set */
   94 
   95 #define _ASM
   96 #include <sys/asm_linkage.h>
   97 
   98 /*
   99  * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
  100  */
  101 
  102 // static uint8_t byte_swap16_mask[] = {
  103 //       15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
  104 .section .rodata        // read-only data: the mask is only ever loaded
  105 .balign XMM_ALIGN       // XMM_ALIGN is not defined in this file (presumably from <sys/asm_linkage.h> -- confirm)
  106 .Lbyte_swap16_mask:     // .L prefix: assembler-local label, not exported
  107         .byte   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0    // pshufb control: result byte i = source byte 15-i
  108 
  109 
  110 /*
  111  * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
  112  *
  113  * Perform a carry-less multiplication (that is, combine partial products
  114  * with XOR instead of addition) on P1 and P2 and place the result in P3.
  115  *
  116  * Byte swap the input and the output.
  117  *
  118  * Note: x_in, y, and res each point to a 16-byte number
  119  * (an array of two 64-bit integers).
  120  *
  121  * Note2: For kernel code, caller is responsible for ensuring
  122  * kpreempt_disable() has been called.  This is because %xmm registers are
  123  * not saved/restored.  The original OpenSolaris code cleared and set the
  124  * CR0.TS bit on entry and exit, or saved %xmm registers on the stack; this
  125  * version does neither, so the caller must preserve %xmm0-%xmm10 if needed.
  126  *
  127  * Note3: Original Intel definition:
  128  * void galois_hash_asm(unsigned char *hk, unsigned char *s,
  129  *      unsigned char *d, int length)
  130  *
  131  * Note4: Register/parameter mapping:
  132  * Intel:
  133  *      Parameter 1: %rcx (copied to %xmm0)     hk or x_in
  134  *      Parameter 2: %rdx (copied to %xmm1)     s or y
  135  *      Parameter 3: %rdi (result)              d or res
  136  * OpenSolaris:
  137  *      Parameter 1: %rdi (copied to %xmm0)     x_in
  138  *      Parameter 2: %rsi (copied to %xmm1)     y
  139  *      Parameter 3: %rdx (result)              res
  140  */
  141 
  142 ENTRY_NP(gcm_mul_pclmulqdq)
  143         // Multiply *x_in (%rdi) by *y (%rsi) as a Galois field (GHASH)
  144         // product and store the 16-byte result to *res (%rdx).
  145         // Copy Parameters (unaligned loads, so no alignment is assumed).
  146         movdqu  (%rdi), %xmm0   // P1
  147         movdqu  (%rsi), %xmm1   // P2
  148 
  149         //
  150         // Byte swap 16-byte input (reverse the byte order of both operands)
  151         //
  152         lea     .Lbyte_swap16_mask(%rip), %rax
  153         movups  (%rax), %xmm10  // %xmm10 = swap mask; stays live until the output swap
  154         pshufb  %xmm10, %xmm0
  155         pshufb  %xmm10, %xmm1
  156 
  157 
  158         // Multiply with the hash key: four 64x64-bit carry-less products.
  159         // a = xmm0, b = xmm1; index 0 is the low quadword, 1 the high one.
  160         // Each product needs its own copy of xmm0 (pclmulqdq overwrites it).
  161         movdqu  %xmm0, %xmm3
  162         pclmulqdq $0, %xmm1, %xmm3      // xmm3 holds a0*b0
  163 
  164         movdqu  %xmm0, %xmm4
  165         pclmulqdq $16, %xmm1, %xmm4     // xmm4 holds a0*b1
  166 
  167         movdqu  %xmm0, %xmm5
  168         pclmulqdq $1, %xmm1, %xmm5      // xmm5 holds a1*b0
  169         movdqu  %xmm0, %xmm6
  170         pclmulqdq $17, %xmm1, %xmm6     // xmm6 holds a1*b1
  171 
  172         pxor    %xmm5, %xmm4    // xmm4 holds a0*b1 + a1*b0 ("+" is XOR)
  173 
  174         movdqu  %xmm4, %xmm5    // copy the middle term (xmm4) into xmm5
  175         psrldq  $8, %xmm4       // shift xmm4 right by 64 bits (8 bytes)
  176         pslldq  $8, %xmm5       // shift xmm5 left by 64 bits (8 bytes)
  177         pxor    %xmm5, %xmm3    // low half of the middle term -> low 128 bits
  178         pxor    %xmm4, %xmm6    // Register pair <xmm6:xmm3> holds the result
  179                                 // of the carry-less multiplication of
  180                                 // xmm0 by xmm1.
  181 
  182         // We shift the result of the multiplication by one bit position
  183         // to the left to account for the fact that the bits are reversed.
  184         movdqu  %xmm3, %xmm7
  185         movdqu  %xmm6, %xmm8
  186         pslld   $1, %xmm3       // shift every 32-bit lane left by one bit
  187         pslld   $1, %xmm6
  188         psrld   $31, %xmm7      // isolate the bit shifted out of each lane
  189         psrld   $31, %xmm8
  190         movdqu  %xmm7, %xmm9    // keep xmm3's carries before they are moved
  191         pslldq  $4, %xmm8       // move each carry bit up into the next lane
  192         pslldq  $4, %xmm7
  193         psrldq  $12, %xmm9      // carry out of xmm3's top lane -> lane 0
  194         por     %xmm7, %xmm3
  195         por     %xmm8, %xmm6
  196         por     %xmm9, %xmm6    // carry from the low half into the high half
  197 
  198         //
  199         // First phase of the reduction
  200         //
  201         // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
  202         // independently.
  203         movdqu  %xmm3, %xmm7
  204         movdqu  %xmm3, %xmm8
  205         movdqu  %xmm3, %xmm9
  206         pslld   $31, %xmm7      // packed left shift of 32-bit lanes: << 31
  207         pslld   $30, %xmm8      // packed left shift of 32-bit lanes: << 30
  208         pslld   $25, %xmm9      // packed left shift of 32-bit lanes: << 25
  209         pxor    %xmm8, %xmm7    // xor the shifted versions
  210         pxor    %xmm9, %xmm7
  211         movdqu  %xmm7, %xmm8
  212         pslldq  $12, %xmm7      // lowest lane moved to the top lane
  213         psrldq  $4, %xmm8       // upper three lanes; used in the second phase
  214         pxor    %xmm7, %xmm3    // first phase of the reduction complete
  215 
  216         //
  217         // Second phase of the reduction
  218         //
  219         // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
  220         // shift operations.
  221         movdqu  %xmm3, %xmm2
  222         movdqu  %xmm3, %xmm4
  223         movdqu  %xmm3, %xmm5
  224         psrld   $1, %xmm2       // packed right shift of 32-bit lanes: >> 1
  225         psrld   $2, %xmm4       // packed right shift of 32-bit lanes: >> 2
  226         psrld   $7, %xmm5       // packed right shift of 32-bit lanes: >> 7
  227         pxor    %xmm4, %xmm2    // xor the shifted versions
  228         pxor    %xmm5, %xmm2
  229         pxor    %xmm8, %xmm2    // add the term saved in xmm8 by the first phase
  230         pxor    %xmm2, %xmm3
  231         pxor    %xmm3, %xmm6    // the result is in xmm6
  232 
  233         //
  234         // Byte swap 16-byte result
  235         //
  236         pshufb  %xmm10, %xmm6   // %xmm10 has the swap mask
  237 
  238         //
  239         // Store the result (unaligned store)
  240         //
  241         movdqu  %xmm6, (%rdx)   // P3
  242 
  243 
  244         //
  245         // Return (void).  Clobbered, not restored: %rax, %xmm0-%xmm10.
  246         //
  247         RET
  248         SET_SIZE(gcm_mul_pclmulqdq)
  249 
  250 #endif  /* lint || __lint */
  251 
  252 #ifdef __ELF__
  253 .section .note.GNU-stack,"",%progbits   // empty note section: tells the GNU linker this object needs no executable stack
  254 #endif

Cache object: 60abf18a625faa70a95948aa8ba7b1d5


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.