The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * ---------------------------------------------------------------------------
    3  * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
    4  *
    5  * LICENSE TERMS
    6  *
    7  * The free distribution and use of this software is allowed (with or without
    8  * changes) provided that:
    9  *
   10  *  1. source code distributions include the above copyright notice, this
   11  *     list of conditions and the following disclaimer;
   12  *
   13  *  2. binary distributions include the above copyright notice, this list
   14  *     of conditions and the following disclaimer in their documentation;
   15  *
   16  *  3. the name of the copyright holder is not used to endorse products
   17  *     built using this software without specific written permission.
   18  *
   19  * DISCLAIMER
   20  *
   21  * This software is provided 'as is' with no explicit or implied warranties
   22  * in respect of its properties, including, but not limited to, correctness
   23  * and/or fitness for purpose.
   24  * ---------------------------------------------------------------------------
   25  * Issue 20/12/2007
   26  *
   27  * I am grateful to Dag Arne Osvik for many discussions of the techniques that
   28  * can be used to optimise AES assembler code on AMD64/EM64T architectures.
   29  * Some of the techniques used in this implementation are the result of
   30  * suggestions made by him for which I am most grateful.
   31  *
   32  * An AES implementation for AMD64 processors using the YASM assembler.  This
   33  * implementation provides only encryption and decryption, hence requires key
   34  * scheduling support in C. It uses 8k bytes of tables but its encryption and
   35  * decryption performance is very close to that obtained using large tables.
   36  * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
   37  * which are as follows:
   38  *               ms windows  gnu/linux/opensolaris os
   39  *
   40  *   in_blk          rcx     rdi
   41  *   out_blk         rdx     rsi
   42  *   context (cx)     r8     rdx
   43  *
   44  *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
   45  *   registers       rdi      -      on both
   46  *
   47  *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
   48  *   registers        -      rdi     on both
   49  *
   50  * The convention used here is that for gnu/linux/opensolaris os.
   51  *
   52  * This code provides the standard AES block size (128 bits, 16 bytes) and the
   53  * three standard AES key sizes (128, 192 and 256 bits). It has the same call
   54  * interface as my C implementation.  It uses the Microsoft C AMD64 calling
   55  * conventions in which the three parameters are placed in  rcx, rdx and r8
   56  * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
   57  *
   58  * OpenSolaris Note:
   59  * Modified to use GNU/Linux/Solaris calling conventions.
   60  * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
   61  *
   62  *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
   63  *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
   64  *
   65  *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
   66  *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
   67  *
   68  *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
   69  *                                            const aes_encrypt_ctx cx[1])/
   70  *
   71  *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
   72  *                                            const aes_decrypt_ctx cx[1])/
   73  *
   74  *     AES_RETURN aes_encrypt_key(const unsigned char key[],
   75  *                           unsigned int len, const aes_decrypt_ctx cx[1])/
   76  *
   77  *     AES_RETURN aes_decrypt_key(const unsigned char key[],
   78  *                           unsigned int len, const aes_decrypt_ctx cx[1])/
   79  *
   80  * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
   81  * either bits or bytes.
   82  *
   83  * Comment in/out the following lines to obtain the desired subroutines. These
   84  * selections MUST match those in the C header file aesopt.h
   85  */
   86 #define AES_REV_DKS       /* define if key decryption schedule is reversed */
   87 
   88 #define LAST_ROUND_TABLES /* define for the faster version using extra tables */
   89 
   90 /*
   91  * The encryption key schedule has the following in memory layout where N is the
   92  * number of rounds (10, 12 or 14):
   93  *
   94  * lo: | input key (round 0)  |  / each round is four 32-bit words
   95  *     | encryption round 1   |
   96  *     | encryption round 2   |
   97  *     ....
   98  *     | encryption round N-1 |
   99  * hi: | encryption round N   |
  100  *
  101  * The decryption key schedule is normally set up so that it has the same
  102  * layout as above by actually reversing the order of the encryption key
  103  * schedule in memory (this happens when AES_REV_DKS is set):
  104  *
  105  * lo: | decryption round 0   | =              | encryption round N   |
  106  *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
  107  *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
  108  *     ....                       ....
  109  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
  110  * hi: | decryption round N   | =              | input key (round 0)  |
  111  *
  112  * with rounds except the first and last modified using inv_mix_column().
  113  * But if AES_REV_DKS is NOT set the order of keys is left as it is for
  114  * encryption so that it has to be accessed in reverse when used for
  115  * decryption (although the inverse mix column modifications are done)
  116  *
  117  * lo: | decryption round 0   | =              | input key (round 0)  |
  118  *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
  119  *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
  120  *     ....                       ....
  121  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
  122  * hi: | decryption round N   | =              | encryption round N   |
  123  *
  124  * This layout is faster when the assembler key scheduling provided here
  125  * is used.
  126  *
  127  * End of user defines
  128  */
  129 
  130 /*
  131  * ---------------------------------------------------------------------------
  132  * OpenSolaris OS modifications
  133  *
  134  * This source originates from Brian Gladman file aes_amd64.asm
  135  * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
  136  * with these changes:
  137  *
  138  * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
  139  * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
  140  * AES_128, AES_192, AES_256, AES_VAR ifdefs.
  141  *
  142  * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
  143  *
  144  * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
  145  *
  146  * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
  147  * (operands reversed, literals prefixed with "$", registers prefixed with "%",
  148  * "[register+offset]" addressing changed to "offset(register)",
  149  * parentheses in constant expressions "()" changed to square brackets "[]",
  150  * "." removed from local (numeric) labels, and other changes).
  151  * Examples:
  152  * Intel/yasm/nasm Syntax       ATT/OpenSolaris Syntax
  153  * mov  rax,(4*20h)             mov     $[4*0x20],%rax
  154  * mov  rax,[ebx+20h]           mov     0x20(%ebx),%rax
  155  * lea  rax,[ebx+ecx]           lea     (%ebx,%ecx),%rax
  156  * sub  rax,[ebx+ecx*4-20h]     sub     -0x20(%ebx,%ecx,4),%rax
  157  *
  158  * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
  159  * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
  160  * definitions for lint.
  161  *
  162  * 6. Renamed functions and reordered parameters to match OpenSolaris:
  163  * Original Gladman interface:
  164  *      int aes_encrypt(const unsigned char *in,
  165  *              unsigned char *out, const aes_encrypt_ctx cx[1])/
  166  *      int aes_decrypt(const unsigned char *in,
  167  *              unsigned char *out, const aes_decrypt_ctx cx[1])/
  168  * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
  169  * and a union type, inf., containing inf.l, a uint32_t and
  170  * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
  171  * used and contains the number of rounds * 16, where the number of rounds is
  172  * 10, 12, or 14.
  173  *
  174  * OpenSolaris OS interface:
  175  *      void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
  176  *              const uint32_t pt[4], uint32_t ct[4])/
  177  *      void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
  178  *              const uint32_t pt[4], uint32_t ct[4])/
  179  *      typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
  180  *               uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
  181  * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
  182  * ct is crypto text, and MAX_AES_NR is 14.
  183  * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
  184  */
  185 
  186 #if defined(lint) || defined(__lint)
  187 
  188 #include <sys/types.h>
  189 void
  190 aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
  191        uint32_t ct[4]) {
                      /*
                       * Lint-only placeholder: this branch is compiled only
                       * when lint or __lint is defined.  It performs no work
                       * and merely references each parameter once.
                       */
  192                 (void) rk, (void) Nr, (void) pt, (void) ct;
  193 }
  194 void
  195 aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
  196        uint32_t pt[4]) {
                      /*
                       * Lint-only placeholder: this branch is compiled only
                       * when lint or __lint is defined.  It performs no work
                       * and merely references each parameter once.
                       */
  197                 (void) rk, (void) Nr, (void) pt, (void) ct;
  198 }
  199 
  200 
  201 #else
  202 
  203 #define _ASM
  204 #include <sys/asm_linkage.h>
  205 
  206 #define KS_LENGTH       60      /* 4*KS_LENGTH is the displacement used to read the length field under GLADMAN_INTERFACE */
  207 
      /*
       * Aliases mapping an "r..d" spelling to the 32-bit register name.
       * The names expand without the "%" prefix.
       */
  208 #define raxd            eax
  209 #define rdxd            edx
  210 #define rcxd            ecx
  211 #define rbxd            ebx
  212 #define rsid            esi
  213 #define rdid            edi
  214 
      /*
       * Aliases mapping an "r..b" spelling to the 8-bit (low byte) register
       * name.  The names expand without the "%" prefix.
       */
  215 #define raxb            al
  216 #define rdxb            dl
  217 #define rcxb            cl
  218 #define rbxb            bl
  219 #define rsib            sil
  220 #define rdib            dil
  221 
  222 // finite field multiplies by {02}, {04} and {08}
      /*
       * Each macro shifts x left by 1, 2 or 3 bits and, for every bit of x
       * that is shifted past bit 7, XORs in 0x11b multiplied by 1, 2 or 4 so
       * that the overflow bit is cancelled.  The arguments are the literal
       * byte constants supplied by enc_vals/dec_vals.
       */
  223 
  224 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
  225 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
  226 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
  227 
  228 // finite field multiplies required in table generation
      /*
       * Built by XORing the power-of-two multiples above:
       * {03} = {02}^{01}, {09} = {08}^{01}, {0b} = {08}^{02}^{01},
       * {0d} = {08}^{04}^{01}, {0e} = {08}^{04}^{02}.
       */
  229 
  230 #define f3(x) ((f2(x)) ^ (x))
  231 #define f9(x) ((f8(x)) ^ (x))
  232 #define fb(x) ((f8(x)) ^ (f2(x)) ^ (x))
  233 #define fd(x) ((f8(x)) ^ (f4(x)) ^ (x))
  234 #define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x)))
  235 
  236 // macros for expanding S-box data
      /*
       * Each macro expands one byte value into the eight bytes of a table
       * entry (the list is meant to follow a .byte directive):
       *   u8: {02}x, x, x, {03}x, written twice
       *   v8: {0e}x, {09}x, {0d}x, {0b}x, {0e}x, {09}x, {0d}x, x
       *   w8: x, 0, 0, 0, written twice
       * The final byte of a v8 entry is the plain value x; it is the byte
       * addressed by tab_i (offset 7).
       */
  237 
  238 #define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x))
  239 #define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x)
  240 #define w8(x) (x), 0, 0, 0, (x), 0, 0, 0
  241 
  242 #define enc_vals(x)     /* 32 .byte lines of 8 values (256 in all), each value passed through macro x */ \
  243    .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
  244    .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
  245    .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
  246    .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
  247    .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
  248    .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
  249    .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
  250    .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
  251    .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
  252    .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
  253    .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
  254    .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
  255    .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
  256    .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
  257    .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
  258    .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
  259    .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
  260    .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
  261    .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
  262    .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
  263    .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
  264    .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
  265    .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
  266    .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
  267    .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
  268    .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
  269    .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
  270    .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
  271    .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
  272    .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
  273    .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
  274    .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
  275 
  276 #define dec_vals(x) /* same shape as enc_vals: 256 byte values, each passed through macro x; the table that uses it is outside this view */ \
  277    .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
  278    .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
  279    .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
  280    .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
  281    .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
  282    .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
  283    .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
  284    .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
  285    .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
  286    .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
  287    .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
  288    .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
  289    .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
  290    .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
  291    .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
  292    .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
  293    .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
  294    .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
  295    .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
  296    .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
  297    .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
  298    .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
  299    .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
  300    .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
  301    .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
  302    .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
  303    .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
  304    .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
  305    .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
  306    .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
  307    .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
  308    .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
  309 
  310 #define tptr    %rbp    /* table pointer */
  311 #define kptr    %r8     /* key schedule pointer */
  312 #define fofs    128     /* adjust offset in key schedule to keep |disp| < 128 */
      /*
       * fk_ref(x, y): memory operand for 32-bit word y of the 16-byte key
       * block that lies 16*x bytes below kptr+fofs.  aes_encrypt_amd64
       * subtracts fofs from kptr and then adds the value in %esi (rounds * 16
       * on the OpenSolaris path), so fk_ref(x, y) names word y of the round
       * key for round (number of rounds - x).
       */
  313 #define fk_ref(x, y)    -16*x+fofs+4*y(kptr)
  314 
      /*
       * ik_ref(x, y): the matching operand for the inverse rounds.  With
       * AES_REV_DKS defined it has the same form as fk_ref; otherwise the
       * displacement grows with x and the bias is -128.
       * NOTE(review): the code that prepares kptr for ik_ref is outside this
       * view - confirm the bias against the decryption entry point.
       */
  315 #ifdef  AES_REV_DKS
  316 #define rofs            128
  317 #define ik_ref(x, y)    -16*x+rofs+4*y(kptr)
  318 
  319 #else
  320 #define rofs            -128
  321 #define ik_ref(x, y)    16*x+rofs+4*y(kptr)
  322 #endif  /* AES_REV_DKS */
  323 
      /*
       * Table operands.  x is an index register scaled by 8, the size of one
       * table entry.  tab_0..tab_3 address the entry at byte offsets 0, 3, 2
       * and 1 and are used as 32-bit operands by the round macros; because
       * u8 and w8 entries repeat their four bytes twice, these offsets read
       * byte-rotated copies of the same word.  tab_f (offset 1) and tab_i
       * (offset 7) are used with byte loads by the last-round macros that
       * are built when LAST_ROUND_TABLES is not defined.
       */
  324 #define tab_0(x)        (tptr,x,8)
  325 #define tab_1(x)        3(tptr,x,8)
  326 #define tab_2(x)        2(tptr,x,8)
  327 #define tab_3(x)        1(tptr,x,8)
  328 #define tab_f(x)        1(tptr,x,8)
  329 #define tab_i(x)        7(tptr,x,8)
  330 
  331 #define ff_rnd(p1, p2, p3, p4, round)   /* normal forward round */ \
              /* \
               * In:      state words in %eax, %ebx, %ecx, %edx. \
               * Out:     new state words in %eax, %ebx, %ecx, %edx. \
               * Scratch: p1..p4 (32-bit registers), %esi/%rsi, %edi/%rdi; \
               *          the shifts and XORs also change the flags. \
               * p1..p4 start as the four key words fk_ref(round, 0..3); every \
               * byte of every state word then selects a table entry that is \
               * XORed into one of p1..p4. \
               */ \
  332         mov     fk_ref(round,0), p1; \
  333         mov     fk_ref(round,1), p2; \
  334         mov     fk_ref(round,2), p3; \
  335         mov     fk_ref(round,3), p4; \
  336  \
              /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
  337         movzx   %al, %esi; \
  338         movzx   %ah, %edi; \
  339         shr     $16, %eax; \
  340         xor     tab_0(%rsi), p1; \
  341         xor     tab_1(%rdi), p4; \
  342         movzx   %al, %esi; \
  343         movzx   %ah, %edi; \
  344         xor     tab_2(%rsi), p3; \
  345         xor     tab_3(%rdi), p2; \
  346  \
              /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
  347         movzx   %bl, %esi; \
  348         movzx   %bh, %edi; \
  349         shr     $16, %ebx; \
  350         xor     tab_0(%rsi), p2; \
  351         xor     tab_1(%rdi), p1; \
  352         movzx   %bl, %esi; \
  353         movzx   %bh, %edi; \
  354         xor     tab_2(%rsi), p4; \
  355         xor     tab_3(%rdi), p3; \
  356  \
              /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
  357         movzx   %cl, %esi; \
  358         movzx   %ch, %edi; \
  359         shr     $16, %ecx; \
  360         xor     tab_0(%rsi), p3; \
  361         xor     tab_1(%rdi), p2; \
  362         movzx   %cl, %esi; \
  363         movzx   %ch, %edi; \
  364         xor     tab_2(%rsi), p1; \
  365         xor     tab_3(%rdi), p4; \
  366  \
              /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
  367         movzx   %dl, %esi; \
  368         movzx   %dh, %edi; \
  369         shr     $16, %edx; \
  370         xor     tab_0(%rsi), p4; \
  371         xor     tab_1(%rdi), p3; \
  372         movzx   %dl, %esi; \
  373         movzx   %dh, %edi; \
  374         xor     tab_2(%rsi), p2; \
  375         xor     tab_3(%rdi), p1; \
  376  \
              /* copy the accumulated words back into the state registers */ \
  377         mov     p1, %eax; \
  378         mov     p2, %ebx; \
  379         mov     p3, %ecx; \
  380         mov     p4, %edx
  381 
  382 #ifdef  LAST_ROUND_TABLES
  383 
  384 #define fl_rnd(p1, p2, p3, p4, round)   /* last forward round */ \
              /* \
               * LAST_ROUND_TABLES variant.  Same byte-to-word routing as \
               * ff_rnd, but tptr is first advanced by 2048 bytes (and left \
               * advanced) so the lookups use the table that follows the \
               * main one.  The result stays in p1..p4; it is not copied \
               * back to %eax..%edx.  %esi and %edi are scratch. \
               */ \
  385         add     $2048, tptr; \
  386         mov     fk_ref(round,0), p1; \
  387         mov     fk_ref(round,1), p2; \
  388         mov     fk_ref(round,2), p3; \
  389         mov     fk_ref(round,3), p4; \
  390  \
              /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
  391         movzx   %al, %esi; \
  392         movzx   %ah, %edi; \
  393         shr     $16, %eax; \
  394         xor     tab_0(%rsi), p1; \
  395         xor     tab_1(%rdi), p4; \
  396         movzx   %al, %esi; \
  397         movzx   %ah, %edi; \
  398         xor     tab_2(%rsi), p3; \
  399         xor     tab_3(%rdi), p2; \
  400  \
              /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
  401         movzx   %bl, %esi; \
  402         movzx   %bh, %edi; \
  403         shr     $16, %ebx; \
  404         xor     tab_0(%rsi), p2; \
  405         xor     tab_1(%rdi), p1; \
  406         movzx   %bl, %esi; \
  407         movzx   %bh, %edi; \
  408         xor     tab_2(%rsi), p4; \
  409         xor     tab_3(%rdi), p3; \
  410  \
              /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
  411         movzx   %cl, %esi; \
  412         movzx   %ch, %edi; \
  413         shr     $16, %ecx; \
  414         xor     tab_0(%rsi), p3; \
  415         xor     tab_1(%rdi), p2; \
  416         movzx   %cl, %esi; \
  417         movzx   %ch, %edi; \
  418         xor     tab_2(%rsi), p1; \
  419         xor     tab_3(%rdi), p4; \
  420  \
              /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
  421         movzx   %dl, %esi; \
  422         movzx   %dh, %edi; \
  423         shr     $16, %edx; \
  424         xor     tab_0(%rsi), p4; \
  425         xor     tab_1(%rdi), p3; \
  426         movzx   %dl, %esi; \
  427         movzx   %dh, %edi; \
  428         xor     tab_2(%rsi), p2; \
  429         xor     tab_3(%rdi), p1
  430 
  431 #else
  432 
  433 #define fl_rnd(p1, p2, p3, p4, round)   /* last forward round */ \
              /* \
               * Variant built when LAST_ROUND_TABLES is not defined.  Each \
               * lookup is a single byte load through tab_f, zero-extended, \
               * rotated left by 0, 8, 16 or 24 bits into its byte lane and \
               * XORed into p1..p4, which were preloaded with the key words \
               * fk_ref(round, 0..3).  The result stays in p1..p4. \
               * %esi and %edi are scratch. \
               */ \
  434         mov     fk_ref(round,0), p1; \
  435         mov     fk_ref(round,1), p2; \
  436         mov     fk_ref(round,2), p3; \
  437         mov     fk_ref(round,3), p4; \
  438  \
              /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
  439         movzx   %al, %esi; \
  440         movzx   %ah, %edi; \
  441         shr     $16, %eax; \
  442         movzx   tab_f(%rsi), %esi; \
  443         movzx   tab_f(%rdi), %edi; \
  444         xor     %esi, p1; \
  445         rol     $8, %edi; \
  446         xor     %edi, p4; \
  447         movzx   %al, %esi; \
  448         movzx   %ah, %edi; \
  449         movzx   tab_f(%rsi), %esi; \
  450         movzx   tab_f(%rdi), %edi; \
  451         rol     $16, %esi; \
  452         rol     $24, %edi; \
  453         xor     %esi, p3; \
  454         xor     %edi, p2; \
  455  \
              /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
  456         movzx   %bl, %esi; \
  457         movzx   %bh, %edi; \
  458         shr     $16, %ebx; \
  459         movzx   tab_f(%rsi), %esi; \
  460         movzx   tab_f(%rdi), %edi; \
  461         xor     %esi, p2; \
  462         rol     $8, %edi; \
  463         xor     %edi, p1; \
  464         movzx   %bl, %esi; \
  465         movzx   %bh, %edi; \
  466         movzx   tab_f(%rsi), %esi; \
  467         movzx   tab_f(%rdi), %edi; \
  468         rol     $16, %esi; \
  469         rol     $24, %edi; \
  470         xor     %esi, p4; \
  471         xor     %edi, p3; \
  472  \
              /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
  473         movzx   %cl, %esi; \
  474         movzx   %ch, %edi; \
  475         movzx   tab_f(%rsi), %esi; \
  476         movzx   tab_f(%rdi), %edi; \
  477         shr     $16, %ecx; \
  478         xor     %esi, p3; \
  479         rol     $8, %edi; \
  480         xor     %edi, p2; \
  481         movzx   %cl, %esi; \
  482         movzx   %ch, %edi; \
  483         movzx   tab_f(%rsi), %esi; \
  484         movzx   tab_f(%rdi), %edi; \
  485         rol     $16, %esi; \
  486         rol     $24, %edi; \
  487         xor     %esi, p1; \
  488         xor     %edi, p4; \
  489  \
              /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
  490         movzx   %dl, %esi; \
  491         movzx   %dh, %edi; \
  492         movzx   tab_f(%rsi), %esi; \
  493         movzx   tab_f(%rdi), %edi; \
  494         shr     $16, %edx; \
  495         xor     %esi, p4; \
  496         rol     $8, %edi; \
  497         xor     %edi, p3; \
  498         movzx   %dl, %esi; \
  499         movzx   %dh, %edi; \
  500         movzx   tab_f(%rsi), %esi; \
  501         movzx   tab_f(%rdi), %edi; \
  502         rol     $16, %esi; \
  503         rol     $24, %edi; \
  504         xor     %esi, p2; \
  505         xor     %edi, p1
  506 
  507 #endif  /* LAST_ROUND_TABLES */
  508 
  509 #define ii_rnd(p1, p2, p3, p4, round)   /* normal inverse round */ \
              /* \
               * In:      state words in %eax, %ebx, %ecx, %edx. \
               * Out:     new state words in %eax, %ebx, %ecx, %edx. \
               * Scratch: p1..p4 (32-bit registers), %esi/%rsi, %edi/%rdi; \
               *          the shifts and XORs also change the flags. \
               * p1..p4 start as the four key words ik_ref(round, 0..3); every \
               * byte of every state word then selects a table entry that is \
               * XORed into one of p1..p4.  The byte-to-word routing runs in \
               * the opposite direction to ff_rnd. \
               */ \
  510         mov     ik_ref(round,0), p1; \
  511         mov     ik_ref(round,1), p2; \
  512         mov     ik_ref(round,2), p3; \
  513         mov     ik_ref(round,3), p4; \
  514  \
              /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
  515         movzx   %al, %esi; \
  516         movzx   %ah, %edi; \
  517         shr     $16, %eax; \
  518         xor     tab_0(%rsi), p1; \
  519         xor     tab_1(%rdi), p2; \
  520         movzx   %al, %esi; \
  521         movzx   %ah, %edi; \
  522         xor     tab_2(%rsi), p3; \
  523         xor     tab_3(%rdi), p4; \
  524  \
              /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
  525         movzx   %bl, %esi; \
  526         movzx   %bh, %edi; \
  527         shr     $16, %ebx; \
  528         xor     tab_0(%rsi), p2; \
  529         xor     tab_1(%rdi), p3; \
  530         movzx   %bl, %esi; \
  531         movzx   %bh, %edi; \
  532         xor     tab_2(%rsi), p4; \
  533         xor     tab_3(%rdi), p1; \
  534  \
              /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
  535         movzx   %cl, %esi; \
  536         movzx   %ch, %edi; \
  537         shr     $16, %ecx; \
  538         xor     tab_0(%rsi), p3; \
  539         xor     tab_1(%rdi), p4; \
  540         movzx   %cl, %esi; \
  541         movzx   %ch, %edi; \
  542         xor     tab_2(%rsi), p1; \
  543         xor     tab_3(%rdi), p2; \
  544  \
              /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
  545         movzx   %dl, %esi; \
  546         movzx   %dh, %edi; \
  547         shr     $16, %edx; \
  548         xor     tab_0(%rsi), p4; \
  549         xor     tab_1(%rdi), p1; \
  550         movzx   %dl, %esi; \
  551         movzx   %dh, %edi; \
  552         xor     tab_2(%rsi), p2; \
  553         xor     tab_3(%rdi), p3; \
  554  \
              /* copy the accumulated words back into the state registers */ \
  555         mov     p1, %eax; \
  556         mov     p2, %ebx; \
  557         mov     p3, %ecx; \
  558         mov     p4, %edx
  559 
  560 #ifdef  LAST_ROUND_TABLES
  561 
  562 #define il_rnd(p1, p2, p3, p4, round)   /* last inverse round */ \
              /* \
               * LAST_ROUND_TABLES variant.  Same byte-to-word routing as \
               * ii_rnd, but tptr is first advanced by 2048 bytes (and left \
               * advanced) so the lookups use the table that follows the \
               * main one.  The result stays in p1..p4; it is not copied \
               * back to %eax..%edx.  %esi and %edi are scratch. \
               */ \
  563         add     $2048, tptr; \
  564         mov     ik_ref(round,0), p1; \
  565         mov     ik_ref(round,1), p2; \
  566         mov     ik_ref(round,2), p3; \
  567         mov     ik_ref(round,3), p4; \
  568  \
              /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
  569         movzx   %al, %esi; \
  570         movzx   %ah, %edi; \
  571         shr     $16, %eax; \
  572         xor     tab_0(%rsi), p1; \
  573         xor     tab_1(%rdi), p2; \
  574         movzx   %al, %esi; \
  575         movzx   %ah, %edi; \
  576         xor     tab_2(%rsi), p3; \
  577         xor     tab_3(%rdi), p4; \
  578  \
              /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
  579         movzx   %bl, %esi; \
  580         movzx   %bh, %edi; \
  581         shr     $16, %ebx; \
  582         xor     tab_0(%rsi), p2; \
  583         xor     tab_1(%rdi), p3; \
  584         movzx   %bl, %esi; \
  585         movzx   %bh, %edi; \
  586         xor     tab_2(%rsi), p4; \
  587         xor     tab_3(%rdi), p1; \
  588  \
              /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
  589         movzx   %cl, %esi; \
  590         movzx   %ch, %edi; \
  591         shr     $16, %ecx; \
  592         xor     tab_0(%rsi), p3; \
  593         xor     tab_1(%rdi), p4; \
  594         movzx   %cl, %esi; \
  595         movzx   %ch, %edi; \
  596         xor     tab_2(%rsi), p1; \
  597         xor     tab_3(%rdi), p2; \
  598  \
              /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
  599         movzx   %dl, %esi; \
  600         movzx   %dh, %edi; \
  601         shr     $16, %edx; \
  602         xor     tab_0(%rsi), p4; \
  603         xor     tab_1(%rdi), p1; \
  604         movzx   %dl, %esi; \
  605         movzx   %dh, %edi; \
  606         xor     tab_2(%rsi), p2; \
  607         xor     tab_3(%rdi), p3
  608 
  609 #else
  610 
  611 #define il_rnd(p1, p2, p3, p4, round)   /* last inverse round */ \
              /* \
               * Variant built when LAST_ROUND_TABLES is not defined.  Each \
               * lookup is a single byte load through tab_i, zero-extended, \
               * rotated left by 0, 8, 16 or 24 bits into its byte lane and \
               * XORed into p1..p4, which were preloaded with the key words \
               * ik_ref(round, 0..3).  The result stays in p1..p4. \
               * %esi and %edi are scratch. \
               */ \
  612         mov     ik_ref(round,0), p1; \
  613         mov     ik_ref(round,1), p2; \
  614         mov     ik_ref(round,2), p3; \
  615         mov     ik_ref(round,3), p4; \
  616  \
              /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
  617         movzx   %al, %esi; \
  618         movzx   %ah, %edi; \
  619         movzx   tab_i(%rsi), %esi; \
  620         movzx   tab_i(%rdi), %edi; \
  621         shr     $16, %eax; \
  622         xor     %esi, p1; \
  623         rol     $8, %edi; \
  624         xor     %edi, p2; \
  625         movzx   %al, %esi; \
  626         movzx   %ah, %edi; \
  627         movzx   tab_i(%rsi), %esi; \
  628         movzx   tab_i(%rdi), %edi; \
  629         rol     $16, %esi; \
  630         rol     $24, %edi; \
  631         xor     %esi, p3; \
  632         xor     %edi, p4; \
  633  \
              /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
  634         movzx   %bl, %esi; \
  635         movzx   %bh, %edi; \
  636         movzx   tab_i(%rsi), %esi; \
  637         movzx   tab_i(%rdi), %edi; \
  638         shr     $16, %ebx; \
  639         xor     %esi, p2; \
  640         rol     $8, %edi; \
  641         xor     %edi, p3; \
  642         movzx   %bl, %esi; \
  643         movzx   %bh, %edi; \
  644         movzx   tab_i(%rsi), %esi; \
  645         movzx   tab_i(%rdi), %edi; \
  646         rol     $16, %esi; \
  647         rol     $24, %edi; \
  648         xor     %esi, p4; \
  649         xor     %edi, p1; \
  650  \
              /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
  651         movzx   %cl, %esi; \
  652         movzx   %ch, %edi; \
  653         movzx   tab_i(%rsi), %esi; \
  654         movzx   tab_i(%rdi), %edi; \
  655         shr     $16, %ecx; \
  656         xor     %esi, p3; \
  657         rol     $8, %edi; \
  658         xor     %edi, p4; \
  659         movzx   %cl, %esi; \
  660         movzx   %ch, %edi; \
  661         movzx   tab_i(%rsi), %esi; \
  662         movzx   tab_i(%rdi), %edi; \
  663         rol     $16, %esi; \
  664         rol     $24, %edi; \
  665         xor     %esi, p1; \
  666         xor     %edi, p2; \
  667  \
              /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
  668         movzx   %dl, %esi; \
  669         movzx   %dh, %edi; \
  670         movzx   tab_i(%rsi), %esi; \
  671         movzx   tab_i(%rdi), %edi; \
  672         shr     $16, %edx; \
  673         xor     %esi, p4; \
  674         rol     $8, %edi; \
  675         xor     %edi, p1; \
  676         movzx   %dl, %esi; \
  677         movzx   %dh, %edi; \
  678         movzx   tab_i(%rsi), %esi; \
  679         movzx   tab_i(%rdi), %edi; \
  680         rol     $16, %esi; \
  681         rol     $24, %edi; \
  682         xor     %esi, p2; \
  683         xor     %edi, p3
  684 
  685 #endif  /* LAST_ROUND_TABLES */
  686 
  687 /*
  688  * OpenSolaris OS:
  689  * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
  690  *      const uint32_t pt[4], uint32_t ct[4])/
  691  *
  692  * Original interface:
  693  * int aes_encrypt(const unsigned char *in,
  694  *      unsigned char *out, const aes_encrypt_ctx cx[1])/
  695  */
  696 SECTION_STATIC
      /*
       * Encryption lookup tables, placed on a 64-byte boundary.
       * enc_vals(u8) emits 256 entries of 8 bytes each (2048 bytes).  When
       * LAST_ROUND_TABLES is defined, a second 2048-byte table built with w8
       * follows immediately; fl_rnd reaches it by adding 2048 to tptr.
       */
  697 .balign 64
  698 enc_tab:
  699         enc_vals(u8)
  700 #ifdef  LAST_ROUND_TABLES
  701         // Last Round Tables:
  702         enc_vals(w8)
  703 #endif
  704 
  705 
  706 ENTRY_NP(aes_encrypt_amd64)
  707         ENDBR
  708 #ifdef  GLADMAN_INTERFACE
  709         // Original interface
  710         sub     $[4*8], %rsp    // gnu/linux/opensolaris binary interface
  711         mov     %rsi, (%rsp)    // output pointer (P2)
  712         mov     %rdx, %r8       // context (P3)
  713 
  714         mov     %rbx, 1*8(%rsp) // P1: input pointer in rdi
  715         mov     %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
  716         mov     %r12, 3*8(%rsp) // P3: context in r8
  717         movzx   4*KS_LENGTH(kptr), %esi // Get byte key length * 16
  718 
  719 #else
  720         // OpenSolaris OS interface
  721         sub     $(4*8), %rsp    // Make room on stack to save registers
  722         mov     %rcx, (%rsp)    // Save output pointer (P4) on stack
  723         mov     %rdi, %r8       // context (P1)
  724         mov     %rdx, %rdi      // P3: save input pointer
  725         shl     $4, %esi        // P2: esi byte key length * 16
  726 
  727         mov     %rbx, 1*8(%rsp) // Save registers
  728         mov     %rbp, 2*8(%rsp)
  729         mov     %r12, 3*8(%rsp)
  730         // P1: context in r8
  731         // P2: byte key length * 16 in esi
  732         // P3: input pointer in rdi
  733         // P4: output pointer in (rsp)
  734 #endif  /* GLADMAN_INTERFACE */
  735 
  736         lea     enc_tab(%rip), tptr
  737         sub     $fofs, kptr
  738 
  739         // Load input block into registers
  740         mov     (%rdi), %eax
  741         mov     1*4(%rdi), %ebx
  742         mov     2*4(%rdi), %ecx
  743         mov     3*4(%rdi), %edx
  744 
  745         xor     fofs(kptr), %eax
  746         xor     fofs+4(kptr), %ebx
  747         xor     fofs+8(kptr), %ecx
  748         xor     fofs+12(kptr), %edx
  749 
  750         lea     (kptr,%rsi), kptr
  751         // Jump based on byte key length * 16:
  752         cmp     $(10*16), %esi
  753         je      3f
  754         cmp     $(12*16), %esi
  755         je      2f
  756         cmp     $(14*16), %esi
  757         je      1f
  758         mov     $-1, %rax       // error
  759         jmp     4f
  760 
  761         // Perform normal forward rounds
  762 1:      ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
  763         ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
  764 2:      ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
  765         ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
  766 3:      ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
  767         ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
  768         ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
  769         ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
  770         ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
  771         ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
  772         ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
  773         ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
  774         ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
  775         fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)
  776 
  777         // Copy results
  778         mov     (%rsp), %rbx
  779         mov     %r9d, (%rbx)
  780         mov     %r10d, 4(%rbx)
  781         mov     %r11d, 8(%rbx)
  782         mov     %r12d, 12(%rbx)
  783         xor     %rax, %rax
  784 4:      // Restore registers
  785         mov     1*8(%rsp), %rbx
  786         mov     2*8(%rsp), %rbp
  787         mov     3*8(%rsp), %r12
  788         add     $(4*8), %rsp
  789         RET
  790 
  791         SET_SIZE(aes_encrypt_amd64)
  792 
  793 /*
  794  * OpenSolaris OS:
  795  * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
  796  *      const uint32_t ct[4], uint32_t pt[4]);
  797  *
  798  * Original interface:
  799  * int aes_decrypt(const unsigned char *in,
  800  *      unsigned char *out, const aes_decrypt_ctx cx[1]);
  801  */
  802 SECTION_STATIC
  803 .balign 64
      // Lookup table data for the decrypt rounds, aligned to 64 bytes by the
      // .balign above. Its address is loaded into tptr at function entry.
  804 dec_tab:
  805         dec_vals(v8)
  806 #ifdef  LAST_ROUND_TABLES
  807         // Last Round Tables: a second dec_vals expansion laid out directly after the first
  808         dec_vals(w8)
  809 #endif
  810 
  811 
  812 ENTRY_NP(aes_decrypt_amd64)
  813         ENDBR
              // Both interface variants below finish with the same layout:
              //   r8    = context (key schedule) pointer
              //   esi   = value dispatched on below (10*16, 12*16 or 14*16)
              //   rdi   = input block pointer
              //   (rsp) = output block pointer
              //   1*8, 2*8, 3*8(%rsp) = saved rbx, rbp, r12
  814 #ifdef  GLADMAN_INTERFACE
  815         // Original interface: in = rdi (P1), out = rsi (P2), context = rdx (P3)
  816         sub     $[4*8], %rsp    // Reserve 4 stack slots (gnu/linux/opensolaris binary interface)
  817         mov     %rsi, (%rsp)    // Slot 0: output pointer (P2)
  818         mov     %rdx, %r8       // context (P3) moved to r8
  819 
  820         mov     %rbx, 1*8(%rsp) // Save rbx; P1: input pointer stays in rdi
  821         mov     %rbp, 2*8(%rsp) // Save rbp; P2: output pointer in (rsp)
  822         mov     %r12, 3*8(%rsp) // Save r12; P3: context in r8
              // NOTE(review): memory-source movzx without a size suffix; confirm the
              // intended load width if this path is ever enabled.
  823         movzx   4*KS_LENGTH(kptr), %esi // Load from kptr+4*KS_LENGTH; compared with 10*16/12*16/14*16 below
  824 
  825 #else
  826         // OpenSolaris OS interface: ks = rdi (P1), Nr = esi (P2), input = rdx (P3), output = rcx (P4)
  827         sub     $(4*8), %rsp    // Make room on stack: output pointer + 3 saved registers
  828         mov     %rcx, (%rsp)    // Save output pointer (P4) on stack
  829         mov     %rdi, %r8       // context (P1)
  830         mov     %rdx, %rdi      // P3: move input pointer into rdi
  831         shl     $4, %esi        // P2: esi = Nr * 16, used below as a byte offset from kptr
  832 
  833         mov     %rbx, 1*8(%rsp) // Save registers (restored at label 4)
  834         mov     %rbp, 2*8(%rsp)
  835         mov     %r12, 3*8(%rsp)
  836         // P1: context in r8
  837         // P2: Nr * 16 in esi
  838         // P3: input pointer in rdi
  839         // P4: output pointer in (rsp)
  840 #endif  /* GLADMAN_INTERFACE */
  841 
  842         lea     dec_tab(%rip), tptr     // tptr = address of dec_tab (RIP-relative)
  843         sub     $rofs, kptr             // Bias kptr by -rofs; rofs(reg) then cancels the bias
  844 
  845         // Load the 16-byte input block into eax, ebx, ecx, edx (rdi is reused afterwards)
  846         mov     (%rdi), %eax
  847         mov     1*4(%rdi), %ebx
  848         mov     2*4(%rdi), %ecx
  849         mov     3*4(%rdi), %edx
  850 
              // Pick the address of the first key words to XOR (left in rdi):
              //   AES_REV_DKS: rdi = kptr, then kptr advances by esi
              //   otherwise:   rdi = kptr + esi, kptr stays where it is
  851 #ifdef AES_REV_DKS
  852         mov     kptr, %rdi
  853         lea     (kptr,%rsi), kptr
  854 #else
  855         lea     (kptr,%rsi), %rdi
  856 #endif
  857 
              // XOR the block with the four 32-bit key words selected above
  858         xor     rofs(%rdi), %eax
  859         xor     rofs+4(%rdi), %ebx
  860         xor     rofs+8(%rdi), %ecx
  861         xor     rofs+12(%rdi), %edx
  862 
  863         // Dispatch on esi (rounds * 16): 10, 12 or 14 rounds; any other value is an error
  864         cmp     $(10*16), %esi
  865         je      3f
  866         cmp     $(12*16), %esi
  867         je      2f
  868         cmp     $(14*16), %esi
  869         je      1f
  870         mov     $-1, %rax       // error: rax = -1, output block is not written
  871         jmp     4f
  872 
  873         // Perform normal inverse rounds
              // Entry 1 runs 14 round macros, entry 2 runs 12, entry 3 runs 10;
              // every path falls through to the closing il_rnd. The last macro
              // argument counts down to 0.
  874 1:      ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
  875         ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
  876 2:      ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
  877         ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
  878 3:      ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
  879         ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
  880         ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
  881         ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
  882         ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
  883         ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
  884         ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
  885         ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
  886         ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
  887         il_rnd(%r9d, %r10d, %r11d, %r12d,  0)
  888 
  889         // Copy results: r9d, r10d, r11d, r12d -> the four 32-bit output words
  890         mov     (%rsp), %rbx    // rbx = output pointer saved at entry
  891         mov     %r9d, (%rbx)
  892         mov     %r10d, 4(%rbx)
  893         mov     %r11d, 8(%rbx)
  894         mov     %r12d, 12(%rbx)
  895         xor     %rax, %rax      // rax = 0 on the success path
  896 4:      // Restore saved registers and release the 4 stack slots (shared by success and error paths)
  897         mov     1*8(%rsp), %rbx
  898         mov     2*8(%rsp), %rbp
  899         mov     3*8(%rsp), %r12
  900         add     $(4*8), %rsp
  901         RET
  902 
  903         SET_SIZE(aes_decrypt_amd64)
  904 #endif /* lint || __lint */
  905 
  906 #ifdef __ELF__
      /*
       * Emit an empty .note.GNU-stack section with no flags. This is the GNU
       * toolchain convention for declaring that the object does not need an
       * executable stack.
       */
  907 .section .note.GNU-stack,"",%progbits
  908 #endif

Cache object: 8a552fddfe9c0d7e40bc49f6e2ae755b


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.