FreeBSD/Linux Kernel Cross Reference
sys/crypto/aesni/aesni_ghash.c


    1 /*-
    2  * Copyright (c) 2014 The FreeBSD Foundation
    3  * All rights reserved.
    4  *
    5  * This software was developed by John-Mark Gurney under
    6  * the sponsorship of the FreeBSD Foundation and
    7  * Rubicon Communications, LLC (Netgate).
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1.  Redistributions of source code must retain the above copyright
   12  *     notice, this list of conditions and the following disclaimer.
   13  * 2.  Redistributions in binary form must reproduce the above copyright
   14  *     notice, this list of conditions and the following disclaimer in the
   15  *     documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  *
   30  *      $FreeBSD$
   31  *
   32  */
   33 
   34 /*
    35  * Figures 5, 8 and 12 are copied from the Intel white paper:
   36  * Intel® Carry-Less Multiplication Instruction and its Usage for
   37  * Computing the GCM Mode
   38  *
   39  * and as such are:
   40  * Copyright © 2010 Intel Corporation.
   41  * All rights reserved.
   42  * 
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  *   * Redistributions of source code must retain the above copyright
   47  *     notice, this list of conditions and the following disclaimer.
   48  *   * Redistributions in binary form must reproduce the above copyright
   49  *     notice, this list of conditions and the following disclaimer in the
   50  *     documentation and/or other materials provided with the distribution.
   51  *   * Neither the name of Intel Corporation nor the
   52  *     names of its contributors may be used to endorse or promote products
   53  *     derived from this software without specific prior written permission.
   54  * 
   55  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   56  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   57  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   58  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   59  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   60  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   61  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   62  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   63  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   64  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   65  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   66  */
   67 
   68 #ifdef _KERNEL
   69 #include <crypto/aesni/aesni.h>
   70 #include <crypto/aesni/aesni_os.h>
   71 #else
   72 #include <stdint.h>
   73 #endif
   74 
   75 #include <wmmintrin.h>
   76 #include <emmintrin.h>
   77 #include <smmintrin.h>
   78 
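       /*
        * Compare two 128-bit values for equality; returns non-zero when all
        * four 32-bit lanes match.  Used below to check the GCM tag.
        */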
   79 static inline int
   80 m128icmp(__m128i a, __m128i b)
   81 {
   82         __m128i cmp;
   83 
   84         cmp = _mm_cmpeq_epi32(a, b);
   85 
   86         return _mm_movemask_epi8(cmp) == 0xffff;
   87 }
   88 
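       /*
        * i386 has no 64-bit element insert intrinsic, so emulate
        * _mm_insert_epi64() with two 32-bit inserts.
        */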
   89 #ifdef __i386__
   90 static inline __m128i
   91 _mm_insert_epi64(__m128i a, int64_t b, const int ndx)
   92 {  
   93 
   94         if (!ndx) {
   95                 a = _mm_insert_epi32(a, b, 0);
   96                 a = _mm_insert_epi32(a, b >> 32, 1);
   97         } else {
   98                 a = _mm_insert_epi32(a, b, 2);
   99                 a = _mm_insert_epi32(a, b >> 32, 3);
  100         }
  101 
  102         return a;
  103 }
  104 #endif
  105 
  106 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
  107 
  108 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
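       /*
        * gfmul() multiplies a and b in GF(2^128): four PCLMULQDQ partial
        * products are combined into a 256-bit result, shifted left by one
        * bit to account for GHASH's reflected bit order, and reduced modulo
        * x^128 + x^7 + x^2 + x + 1.
        */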
  109 static void
  110 gfmul(__m128i a, __m128i b, __m128i *res)
  111 {
  112         __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
  113 
  114         tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
  115         tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
  116         tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
  117         tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
  118 
  119         tmp4 = _mm_xor_si128(tmp4, tmp5);
  120         tmp5 = _mm_slli_si128(tmp4, 8);
  121         tmp4 = _mm_srli_si128(tmp4, 8);
  122         tmp3 = _mm_xor_si128(tmp3, tmp5);
  123         tmp6 = _mm_xor_si128(tmp6, tmp4);
  124 
  125         tmp7 = _mm_srli_epi32(tmp3, 31);
  126         tmp8 = _mm_srli_epi32(tmp6, 31);
  127         tmp3 = _mm_slli_epi32(tmp3, 1);
  128         tmp6 = _mm_slli_epi32(tmp6, 1);
  129 
  130         tmp9 = _mm_srli_si128(tmp7, 12);
  131         tmp8 = _mm_slli_si128(tmp8, 4);
  132         tmp7 = _mm_slli_si128(tmp7, 4);
  133         tmp3 = _mm_or_si128(tmp3, tmp7);
  134         tmp6 = _mm_or_si128(tmp6, tmp8);
  135         tmp6 = _mm_or_si128(tmp6, tmp9);
  136 
  137         tmp7 = _mm_slli_epi32(tmp3, 31);
  138         tmp8 = _mm_slli_epi32(tmp3, 30);
  139         tmp9 = _mm_slli_epi32(tmp3, 25);
  140 
  141         tmp7 = _mm_xor_si128(tmp7, tmp8);
  142         tmp7 = _mm_xor_si128(tmp7, tmp9);
  143         tmp8 = _mm_srli_si128(tmp7, 4);
  144         tmp7 = _mm_slli_si128(tmp7, 12);
  145         tmp3 = _mm_xor_si128(tmp3, tmp7);
  146 
  147         tmp2 = _mm_srli_epi32(tmp3, 1);
  148         tmp4 = _mm_srli_epi32(tmp3, 2);
  149         tmp5 = _mm_srli_epi32(tmp3, 7);
  150         tmp2 = _mm_xor_si128(tmp2, tmp4);
  151         tmp2 = _mm_xor_si128(tmp2, tmp5);
  152         tmp2 = _mm_xor_si128(tmp2, tmp8);
  153         tmp3 = _mm_xor_si128(tmp3, tmp2);
  154         tmp6 = _mm_xor_si128(tmp6, tmp3);
  155 
  156         *res = tmp6;
  157 }
  158 
  159 /*
  160  * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
  161  * Method */
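       /*
        * reduce4() computes X1*H1 + X2*H2 + X3*H3 + X4*H4 in GF(2^128) with
        * a single reduction; _mm_shuffle_epi32(x, 78) swaps the 64-bit
        * halves to form the Karatsuba middle terms.
        */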
  162 static void
  163 reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
  164     __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
  165 {
   166         /* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
  167         __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
  168             H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
  169         __m128i tmp0, tmp1, tmp2, tmp3;
  170         __m128i tmp4, tmp5, tmp6, tmp7;
  171         __m128i tmp8, tmp9;
  172 
  173         H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
  174         H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
  175         H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
  176         H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
  177 
  178         lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
  179         lo = _mm_xor_si128(lo, H3_X3_lo);
  180         lo = _mm_xor_si128(lo, H4_X4_lo);
  181 
  182         H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
  183         H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
  184         H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
  185         H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
  186 
  187         hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
  188         hi = _mm_xor_si128(hi, H3_X3_hi);
  189         hi = _mm_xor_si128(hi, H4_X4_hi);
  190 
  191         tmp0 = _mm_shuffle_epi32(H1, 78);
  192         tmp4 = _mm_shuffle_epi32(X1, 78);
  193         tmp0 = _mm_xor_si128(tmp0, H1);
  194         tmp4 = _mm_xor_si128(tmp4, X1);
  195         tmp1 = _mm_shuffle_epi32(H2, 78);
  196         tmp5 = _mm_shuffle_epi32(X2, 78);
  197         tmp1 = _mm_xor_si128(tmp1, H2);
  198         tmp5 = _mm_xor_si128(tmp5, X2);
  199         tmp2 = _mm_shuffle_epi32(H3, 78);
  200         tmp6 = _mm_shuffle_epi32(X3, 78);
  201         tmp2 = _mm_xor_si128(tmp2, H3);
  202         tmp6 = _mm_xor_si128(tmp6, X3);
  203         tmp3 = _mm_shuffle_epi32(H4, 78);
  204         tmp7 = _mm_shuffle_epi32(X4, 78);
  205         tmp3 = _mm_xor_si128(tmp3, H4);
  206         tmp7 = _mm_xor_si128(tmp7, X4);
  207 
  208         tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
  209         tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
  210         tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
  211         tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
  212 
  213         tmp0 = _mm_xor_si128(tmp0, lo);
  214         tmp0 = _mm_xor_si128(tmp0, hi);
  215         tmp0 = _mm_xor_si128(tmp1, tmp0);
  216         tmp0 = _mm_xor_si128(tmp2, tmp0);
  217         tmp0 = _mm_xor_si128(tmp3, tmp0);
  218 
  219         tmp4 = _mm_slli_si128(tmp0, 8);
  220         tmp0 = _mm_srli_si128(tmp0, 8);
  221 
  222         lo = _mm_xor_si128(tmp4, lo);
  223         hi = _mm_xor_si128(tmp0, hi);
  224 
  225         tmp3 = lo;
  226         tmp6 = hi;
  227 
  228         tmp7 = _mm_srli_epi32(tmp3, 31);
  229         tmp8 = _mm_srli_epi32(tmp6, 31);
  230         tmp3 = _mm_slli_epi32(tmp3, 1);
  231         tmp6 = _mm_slli_epi32(tmp6, 1);
  232 
  233         tmp9 = _mm_srli_si128(tmp7, 12);
  234         tmp8 = _mm_slli_si128(tmp8, 4);
  235         tmp7 = _mm_slli_si128(tmp7, 4);
  236         tmp3 = _mm_or_si128(tmp3, tmp7);
  237         tmp6 = _mm_or_si128(tmp6, tmp8);
  238         tmp6 = _mm_or_si128(tmp6, tmp9);
  239 
  240         tmp7 = _mm_slli_epi32(tmp3, 31);
  241         tmp8 = _mm_slli_epi32(tmp3, 30);
  242         tmp9 = _mm_slli_epi32(tmp3, 25);
  243 
  244         tmp7 = _mm_xor_si128(tmp7, tmp8);
  245         tmp7 = _mm_xor_si128(tmp7, tmp9);
  246         tmp8 = _mm_srli_si128(tmp7, 4);
  247         tmp7 = _mm_slli_si128(tmp7, 12);
  248         tmp3 = _mm_xor_si128(tmp3, tmp7);
  249 
  250         tmp2 = _mm_srli_epi32(tmp3, 1);
  251         tmp4 = _mm_srli_epi32(tmp3, 2);
  252         tmp5 = _mm_srli_epi32(tmp3, 7);
  253         tmp2 = _mm_xor_si128(tmp2, tmp4);
  254         tmp2 = _mm_xor_si128(tmp2, tmp5);
  255         tmp2 = _mm_xor_si128(tmp2, tmp8);
  256         tmp3 = _mm_xor_si128(tmp3, tmp2);
  257         tmp6 = _mm_xor_si128(tmp6, tmp3);
  258 
  259         *res = tmp6;
  260 }
  261 
  262 /*
  263  * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
  264  * Every Four Blocks
  265  */
  266 /*
   267  * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 bits, which is
   268  * (2^39-256)/8 = 2^36-32 bytes.
  269  */
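       /*
        * in/out are nbytes of plaintext/ciphertext, addt is abytes of
        * additional authenticated data, ivec is an ibytes-long IV, key
        * points to the expanded AES key schedule (KEY[0..nr], i.e. nr+1
        * 16-byte round keys, nr = 10/12/14), and the 16-byte tag is
        * written to tag.
        */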
  270 void
  271 AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
  272         const unsigned char *addt, const unsigned char *ivec,
  273         unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
  274         const unsigned char *key, int nr)
  275 {
   276         int i, j, k;
  277         __m128i tmp1, tmp2, tmp3, tmp4;
  278         __m128i tmp5, tmp6, tmp7, tmp8;
  279         __m128i H, H2, H3, H4, Y, T;
  280         const __m128i *KEY = (const __m128i *)key;
  281         __m128i ctr1, ctr2, ctr3, ctr4;
  282         __m128i ctr5, ctr6, ctr7, ctr8;
  283         __m128i last_block = _mm_setzero_si128();
  284         __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
  285         __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
  286         __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
  287             7);
  288         __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
  289             15);
  290         __m128i X = _mm_setzero_si128();
  291 
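               /*
                * Derive the hash key H = E(K, 0^128) and the pre-counter
                * block Y0: a 96-bit IV is used directly as IV || 0^31 || 1,
                * any other length is run through GHASH.  T = E(K, Y0) is
                * kept for the final tag.
                */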
  292         if (ibytes == 96/8) {
  293                 Y = _mm_loadu_si128((const __m128i *)ivec);
  294                 Y = _mm_insert_epi32(Y, 0x1000000, 3);
   295                 /* Compute E[ZERO, KS] and E[Y0, KS] together */
  296                 tmp1 = _mm_xor_si128(X, KEY[0]);
  297                 tmp2 = _mm_xor_si128(Y, KEY[0]);
  298                 for (j=1; j < nr-1; j+=2) {
  299                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  300                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
  301 
  302                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  303                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
  304                 }
  305                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  306                 tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
  307 
  308                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
  309                 T = _mm_aesenclast_si128(tmp2, KEY[nr]);
  310 
  311                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
  312         } else {
  313                 tmp1 = _mm_xor_si128(X, KEY[0]);
  314                 for (j=1; j <nr; j++)
  315                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  316                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
  317 
  318                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
  319                 Y = _mm_setzero_si128();
  320 
  321                 for (i=0; i < ibytes/16; i++) {
  322                         tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
  323                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  324                         Y = _mm_xor_si128(Y, tmp1);
  325                         gfmul(Y, H, &Y);
  326                 }
  327                 if (ibytes%16) {
  328                         for (j=0; j < ibytes%16; j++)
  329                                 ((unsigned char*)&last_block)[j] = ivec[i*16+j];
  330                         tmp1 = last_block;
  331                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  332                         Y = _mm_xor_si128(Y, tmp1);
  333                         gfmul(Y, H, &Y);
  334                 }
  335                 tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
  336                 tmp1 = _mm_insert_epi64(tmp1, 0, 1);
  337 
  338                 Y = _mm_xor_si128(Y, tmp1);
  339                 gfmul(Y, H, &Y);
   340                 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
  341                 tmp1 = _mm_xor_si128(Y, KEY[0]);
  342                 for (j=1; j < nr; j++)
  343                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  344                 T = _mm_aesenclast_si128(tmp1, KEY[nr]);
  345         }
  346 
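               /* Precompute H^2, H^3 and H^4 for the 4-block aggregated reduction. */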
  347         gfmul(H,H,&H2);
  348         gfmul(H,H2,&H3);
  349         gfmul(H,H3,&H4);
  350 
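               /* GHASH the additional authenticated data. */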
  351         for (i=0; i<abytes/16/4; i++) {
  352                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
  353                 tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
  354                 tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
  355                 tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
  356 
  357                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  358                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
  359                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
  360                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
  361                 tmp1 = _mm_xor_si128(X, tmp1);
  362 
  363                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
  364         }
  365         for (i=i*4; i<abytes/16; i++) {
  366                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
  367                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  368                 X = _mm_xor_si128(X,tmp1);
  369                 gfmul(X, H, &X);
  370         }
  371         if (abytes%16) {
  372                 last_block = _mm_setzero_si128();
  373                 for (j=0; j<abytes%16; j++)
  374                         ((unsigned char*)&last_block)[j] = addt[i*16+j];
  375                 tmp1 = last_block;
  376                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  377                 X =_mm_xor_si128(X,tmp1);
  378                 gfmul(X,H,&X);
  379         }
  380 
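               /*
                * CTR mode: ctr1..ctr8 hold eight successive counter values,
                * kept byte-swapped so _mm_add_epi64 can increment them and
                * swapped back with BSWAP_EPI64 before each encryption.
                */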
  381         ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
  382         ctr1 = _mm_add_epi64(ctr1, ONE);
  383         ctr2 = _mm_add_epi64(ctr1, ONE);
  384         ctr3 = _mm_add_epi64(ctr2, ONE);
  385         ctr4 = _mm_add_epi64(ctr3, ONE);
  386         ctr5 = _mm_add_epi64(ctr4, ONE);
  387         ctr6 = _mm_add_epi64(ctr5, ONE);
  388         ctr7 = _mm_add_epi64(ctr6, ONE);
  389         ctr8 = _mm_add_epi64(ctr7, ONE);
  390 
  391         for (i=0; i<nbytes/16/8; i++) {
  392                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  393                 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
  394                 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
  395                 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
  396                 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
  397                 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
  398                 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
  399                 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
  400 
  401                 ctr1 = _mm_add_epi64(ctr1, EIGHT);
  402                 ctr2 = _mm_add_epi64(ctr2, EIGHT);
  403                 ctr3 = _mm_add_epi64(ctr3, EIGHT);
  404                 ctr4 = _mm_add_epi64(ctr4, EIGHT);
  405                 ctr5 = _mm_add_epi64(ctr5, EIGHT);
  406                 ctr6 = _mm_add_epi64(ctr6, EIGHT);
  407                 ctr7 = _mm_add_epi64(ctr7, EIGHT);
  408                 ctr8 = _mm_add_epi64(ctr8, EIGHT);
  409 
  410                 tmp1 =_mm_xor_si128(tmp1, KEY[0]);
  411                 tmp2 =_mm_xor_si128(tmp2, KEY[0]);
  412                 tmp3 =_mm_xor_si128(tmp3, KEY[0]);
  413                 tmp4 =_mm_xor_si128(tmp4, KEY[0]);
  414                 tmp5 =_mm_xor_si128(tmp5, KEY[0]);
  415                 tmp6 =_mm_xor_si128(tmp6, KEY[0]);
  416                 tmp7 =_mm_xor_si128(tmp7, KEY[0]);
  417                 tmp8 =_mm_xor_si128(tmp8, KEY[0]);
  418 
  419                 for (j=1; j<nr; j++) {
  420                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  421                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
  422                         tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
  423                         tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
  424                         tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
  425                         tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
  426                         tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
  427                         tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
  428                 }
  429                 tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
  430                 tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
  431                 tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
  432                 tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
  433                 tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
  434                 tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
  435                 tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
  436                 tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
  437 
  438                 tmp1 = _mm_xor_si128(tmp1,
  439                     _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
  440                 tmp2 = _mm_xor_si128(tmp2,
  441                     _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
  442                 tmp3 = _mm_xor_si128(tmp3,
  443                     _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
  444                 tmp4 = _mm_xor_si128(tmp4,
  445                     _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
  446                 tmp5 = _mm_xor_si128(tmp5,
  447                     _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
  448                 tmp6 = _mm_xor_si128(tmp6,
  449                     _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
  450                 tmp7 = _mm_xor_si128(tmp7,
  451                     _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
  452                 tmp8 = _mm_xor_si128(tmp8,
  453                     _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
  454 
  455                 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
  456                 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
  457                 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
  458                 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
  459                 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
  460                 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
  461                 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
  462                 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
  463 
  464                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  465                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
  466                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
  467                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
  468                 tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
  469                 tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
  470                 tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
  471                 tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
  472 
  473                 tmp1 = _mm_xor_si128(X, tmp1);
  474 
  475                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
  476 
  477                 tmp5 = _mm_xor_si128(X, tmp5);
  478                 reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
  479         }
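               /* Encrypt and GHASH any remaining full 16-byte blocks. */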
  480         for (k=i*8; k<nbytes/16; k++) {
  481                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  482                 ctr1 = _mm_add_epi64(ctr1, ONE);
  483                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
  484                 for (j=1; j<nr-1; j+=2) {
  485                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  486                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  487                 }
  488                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  489                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
  490                 tmp1 = _mm_xor_si128(tmp1,
  491                     _mm_loadu_si128(&((const __m128i *)in)[k]));
  492                 _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
  493                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  494                 X = _mm_xor_si128(X, tmp1);
  495                 gfmul(X,H,&X);
  496         }
   497         /* If one incomplete block remains */
  498         if (nbytes%16) {
  499                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  500                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
  501                 for (j=1; j<nr-1; j+=2) {
  502                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  503                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  504                 }
  505                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  506                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
  507                 last_block = _mm_setzero_si128();
  508                 memcpy(&last_block, &((const __m128i *)in)[k],
  509                     nbytes % 16);
  510                 last_block = _mm_xor_si128(last_block, tmp1);
  511                 for (j=0; j<nbytes%16; j++)
  512                         out[k*16+j] = ((unsigned char*)&last_block)[j];
  513                 for ((void)j; j<16; j++)
  514                         ((unsigned char*)&last_block)[j] = 0;
  515                 tmp1 = last_block;
  516                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  517                 X = _mm_xor_si128(X, tmp1);
  518                 gfmul(X, H, &X);
  519         }
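               /*
                * Finish GHASH with the length block (bit lengths of the AAD
                * and the ciphertext), then XOR with E(K, Y0) to form the tag.
                */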
  520         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
  521         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
  522 
  523         X = _mm_xor_si128(X, tmp1);
  524         gfmul(X,H,&X);
  525         X = _mm_shuffle_epi8(X, BSWAP_MASK);
  526         T = _mm_xor_si128(X, T);
  527         _mm_storeu_si128((__m128i*)tag, T);
  528 }
  529 
  530 /* My modification of _encrypt to be _decrypt */
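       /*
        * Decryption mirrors AES_GCM_encrypt, but GHASH is computed over the
        * AAD and the ciphertext first and the tag is verified before any
        * plaintext is written to out.  Returns 1 on success, 0 on tag
        * mismatch.
        */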
  531 int
  532 AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
  533         const unsigned char *addt, const unsigned char *ivec,
  534         const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
  535         const unsigned char *key, int nr)
  536 {
   537         int i, j, k;
  538         __m128i tmp1, tmp2, tmp3, tmp4;
  539         __m128i tmp5, tmp6, tmp7, tmp8;
  540         __m128i H, H2, H3, H4, Y, T;
  541         const __m128i *KEY = (const __m128i *)key;
  542         __m128i ctr1, ctr2, ctr3, ctr4;
  543         __m128i ctr5, ctr6, ctr7, ctr8;
  544         __m128i last_block = _mm_setzero_si128();
  545         __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
  546         __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
  547         __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
  548             7);
  549         __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
  550             15);
  551         __m128i X = _mm_setzero_si128();
  552 
  553         if (ibytes == 96/8) {
  554                 Y = _mm_loadu_si128((const __m128i *)ivec);
  555                 Y = _mm_insert_epi32(Y, 0x1000000, 3);
   556                 /* Compute E[ZERO, KS] and E[Y0, KS] together */
  557                 tmp1 = _mm_xor_si128(X, KEY[0]);
  558                 tmp2 = _mm_xor_si128(Y, KEY[0]);
  559                 for (j=1; j < nr-1; j+=2) {
  560                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  561                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
  562 
  563                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  564                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
  565                 }
  566                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  567                 tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
  568 
  569                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
  570                 T = _mm_aesenclast_si128(tmp2, KEY[nr]);
  571 
  572                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
  573         } else {
  574                 tmp1 = _mm_xor_si128(X, KEY[0]);
  575                 for (j=1; j <nr; j++)
  576                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  577                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
  578 
  579                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
  580                 Y = _mm_setzero_si128();
  581 
  582                 for (i=0; i < ibytes/16; i++) {
  583                         tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
  584                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  585                         Y = _mm_xor_si128(Y, tmp1);
  586                         gfmul(Y, H, &Y);
  587                 }
  588                 if (ibytes%16) {
  589                         for (j=0; j < ibytes%16; j++)
  590                                 ((unsigned char*)&last_block)[j] = ivec[i*16+j];
  591                         tmp1 = last_block;
  592                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  593                         Y = _mm_xor_si128(Y, tmp1);
  594                         gfmul(Y, H, &Y);
  595                 }
  596                 tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
  597                 tmp1 = _mm_insert_epi64(tmp1, 0, 1);
  598 
  599                 Y = _mm_xor_si128(Y, tmp1);
  600                 gfmul(Y, H, &Y);
   601                 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
  602                 tmp1 = _mm_xor_si128(Y, KEY[0]);
  603                 for (j=1; j < nr; j++)
  604                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  605                 T = _mm_aesenclast_si128(tmp1, KEY[nr]);
  606         }
  607 
  608         gfmul(H,H,&H2);
  609         gfmul(H,H2,&H3);
  610         gfmul(H,H3,&H4);
  611 
  612         for (i=0; i<abytes/16/4; i++) {
  613                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
  614                 tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
  615                 tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
  616                 tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
  617 
  618                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  619                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
  620                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
  621                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
  622 
  623                 tmp1 = _mm_xor_si128(X, tmp1);
  624 
  625                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
  626         }
  627         for (i=i*4; i<abytes/16; i++) {
  628                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
  629                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  630                 X = _mm_xor_si128(X,tmp1);
  631                 gfmul(X, H, &X);
  632         }
  633         if (abytes%16) {
  634                 last_block = _mm_setzero_si128();
  635                 for (j=0; j<abytes%16; j++)
  636                         ((unsigned char*)&last_block)[j] = addt[i*16+j];
  637                 tmp1 = last_block;
  638                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  639                 X =_mm_xor_si128(X,tmp1);
  640                 gfmul(X,H,&X);
  641         }
  642 
   643         /* This is where we validate the ciphertext before decrypting */
  644         for (i = 0; i<nbytes/16/4; i++) {
  645                 tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
  646                 tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
  647                 tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
  648                 tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);
  649 
  650                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  651                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
  652                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
  653                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
  654 
  655                 tmp1 = _mm_xor_si128(X, tmp1);
  656 
  657                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
  658         }
  659         for (i = i*4; i<nbytes/16; i++) {
  660                 tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
  661                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  662                 X = _mm_xor_si128(X, tmp1);
  663                 gfmul(X,H,&X);
  664         }
  665         if (nbytes%16) {
  666                 last_block = _mm_setzero_si128();
  667                 for (j=0; j<nbytes%16; j++)
  668                         ((unsigned char*)&last_block)[j] = in[i*16+j];
  669                 tmp1 = last_block;
  670                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  671                 X = _mm_xor_si128(X, tmp1);
  672                 gfmul(X, H, &X);
  673         }
  674 
  675         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
  676         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
  677 
  678         X = _mm_xor_si128(X, tmp1);
  679         gfmul(X,H,&X);
  680         X = _mm_shuffle_epi8(X, BSWAP_MASK);
  681         T = _mm_xor_si128(X, T);
  682 
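               /* Verify the computed tag against the caller-supplied tag. */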
  683         if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
   684                 return 0; /* authentication failed */
  685 
  686         ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
  687         ctr1 = _mm_add_epi64(ctr1, ONE);
  688         ctr2 = _mm_add_epi64(ctr1, ONE);
  689         ctr3 = _mm_add_epi64(ctr2, ONE);
  690         ctr4 = _mm_add_epi64(ctr3, ONE);
  691         ctr5 = _mm_add_epi64(ctr4, ONE);
  692         ctr6 = _mm_add_epi64(ctr5, ONE);
  693         ctr7 = _mm_add_epi64(ctr6, ONE);
  694         ctr8 = _mm_add_epi64(ctr7, ONE);
  695 
  696         for (i=0; i<nbytes/16/8; i++) {
  697                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  698                 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
  699                 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
  700                 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
  701                 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
  702                 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
  703                 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
  704                 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
  705 
  706                 ctr1 = _mm_add_epi64(ctr1, EIGHT);
  707                 ctr2 = _mm_add_epi64(ctr2, EIGHT);
  708                 ctr3 = _mm_add_epi64(ctr3, EIGHT);
  709                 ctr4 = _mm_add_epi64(ctr4, EIGHT);
  710                 ctr5 = _mm_add_epi64(ctr5, EIGHT);
  711                 ctr6 = _mm_add_epi64(ctr6, EIGHT);
  712                 ctr7 = _mm_add_epi64(ctr7, EIGHT);
  713                 ctr8 = _mm_add_epi64(ctr8, EIGHT);
  714 
  715                 tmp1 =_mm_xor_si128(tmp1, KEY[0]);
  716                 tmp2 =_mm_xor_si128(tmp2, KEY[0]);
  717                 tmp3 =_mm_xor_si128(tmp3, KEY[0]);
  718                 tmp4 =_mm_xor_si128(tmp4, KEY[0]);
  719                 tmp5 =_mm_xor_si128(tmp5, KEY[0]);
  720                 tmp6 =_mm_xor_si128(tmp6, KEY[0]);
  721                 tmp7 =_mm_xor_si128(tmp7, KEY[0]);
  722                 tmp8 =_mm_xor_si128(tmp8, KEY[0]);
  723 
  724                 for (j=1; j<nr; j++) {
  725                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  726                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
  727                         tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
  728                         tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
  729                         tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
  730                         tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
  731                         tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
  732                         tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
  733                 }
  734                 tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
  735                 tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
  736                 tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
  737                 tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
  738                 tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
  739                 tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
  740                 tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
  741                 tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
  742 
  743                 tmp1 = _mm_xor_si128(tmp1,
  744                     _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
  745                 tmp2 = _mm_xor_si128(tmp2,
  746                     _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
  747                 tmp3 = _mm_xor_si128(tmp3,
  748                     _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
  749                 tmp4 = _mm_xor_si128(tmp4,
  750                     _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
  751                 tmp5 = _mm_xor_si128(tmp5,
  752                     _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
  753                 tmp6 = _mm_xor_si128(tmp6,
  754                     _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
  755                 tmp7 = _mm_xor_si128(tmp7,
  756                     _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
  757                 tmp8 = _mm_xor_si128(tmp8,
  758                     _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
  759 
  760                 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
  761                 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
  762                 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
  763                 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
  764                 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
  765                 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
  766                 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
  767                 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
  768 
  769                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
  770                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
  771                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
  772                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
  773                 tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
  774                 tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
  775                 tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
  776                 tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
  777         }
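               /*
                * Decrypt any remaining blocks; no further GHASH work is
                * needed since the ciphertext was already hashed above.
                */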
  778         for (k=i*8; k<nbytes/16; k++) {
  779                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  780                 ctr1 = _mm_add_epi64(ctr1, ONE);
  781                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
  782                 for (j=1; j<nr-1; j+=2) {
  783                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  784                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  785                 }
  786                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  787                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
  788                 tmp1 = _mm_xor_si128(tmp1,
  789                     _mm_loadu_si128(&((const __m128i *)in)[k]));
  790                 _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
  791         }
   792         /* If one incomplete block remains */
  793         if (nbytes%16) {
  794                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
  795                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
  796                 for (j=1; j<nr-1; j+=2) {
  797                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
  798                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
  799                 }
  800                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
  801                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
  802                 tmp1 = _mm_xor_si128(tmp1,
  803                     _mm_loadu_si128(&((const __m128i *)in)[k]));
  804                 last_block = tmp1;
  805                 for (j=0; j<nbytes%16; j++)
  806                         out[k*16+j] = ((unsigned char*)&last_block)[j];
  807         }
   808         return 1; /* return 1 on success */
  809 }
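
       /*
        * Illustrative only: a minimal sketch of how these entry points might
        * be driven from user space (the file compiles without _KERNEL).  The
        * key schedule layout is assumed to be nr+1 consecutive 16-byte AES
        * round keys; producing it (with the AES-NI key-expansion routines
        * elsewhere in this directory) is not shown, and the buffer sizes
        * below are hypothetical.
        *
        *      unsigned char sched[15 * 16];   // expanded key, nr = 14 for AES-256
        *      unsigned char iv[12], aad[20], pt[64], ct[64], tag[16];
        *      int nr = 14;
        *
        *      // ... fill sched, iv, aad and pt ...
        *      AES_GCM_encrypt(pt, ct, aad, iv, tag, sizeof(pt), sizeof(aad),
        *          sizeof(iv), sched, nr);
        *      if (AES_GCM_decrypt(ct, pt, aad, iv, tag, sizeof(ct),
        *          sizeof(aad), sizeof(iv), sched, nr) != 1)
        *              ; // authentication failed; pt must not be used
        */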
