The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/crypto/openssl/aarch64/chacha-armv8.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /* $FreeBSD$ */
    2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
    3 #include "arm_arch.h"
    4 
    5 .text
    6 
    7 
    8 .hidden OPENSSL_armcap_P
    9 
   10 .align  5
      // Constant pool. It is emitted in .text so the functions below can reach it with adr.
   11 .Lsigma:
      // Read low byte first, these two words spell the ASCII string "expand 32-byte k".
   12 .quad   0x3320646e61707865,0x6b20657479622d32           // endian-neutral
   13 .Lone:
      // {1,0,0,0}: the NEON paths load this into v31 and add it to the counter row.
   14 .long   1,0,0,0
   15 .LOPENSSL_armcap_P:
      // Distance from this word to OPENSSL_armcap_P (32-bit under __ILP32__, 64-bit otherwise).
      // ChaCha20_ctr32 adds it to the address of this word to reach the variable.
   16 #ifdef  __ILP32__
   17 .long   OPENSSL_armcap_P-.
   18 #else
   19 .quad   OPENSSL_armcap_P-.
   20 #endif
      // NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
   21 .byte   67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   22 .align  2
   24 .globl  ChaCha20_ctr32
   25 .type   ChaCha20_ctr32,%function
   26 .align  5
      // ChaCha20_ctr32: XOR input with ChaCha20 keystream, one 64-byte block at a time.
      // Registers on entry, as used by the code below:
      //   x0 = output pointer          x1 = input pointer        x2 = byte count
      //   x3 = pointer to 32 key bytes x4 = pointer to 16 bytes of counter block
      // Inputs of 192 bytes or more are handed to ChaCha20_neon when the
      // ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs the
      // scalar code starting at .Lshort.
   27 ChaCha20_ctr32:
   28         cbz     x2,.Labort              // nothing to do: return before any stack use
   29         adr     x5,.LOPENSSL_armcap_P
   30         cmp     x2,#192
   31         b.lo    .Lshort                 // short input: skip the capability check
   32 #ifdef  __ILP32__
   33         ldrsw   x6,[x5]
   34 #else
   35         ldr     x6,[x5]
   36 #endif
   37         ldr     w17,[x6,x5]             // address of pool word + stored offset = OPENSSL_armcap_P
   38         tst     w17,#ARMV7_NEON
   39         b.ne    ChaCha20_neon           // branch (not call): arguments x0-x4 are passed through untouched
   40 
   41 .Lshort:
   42 .inst   0xd503233f                      // paciasp
      // 96-byte frame: x29/x30 at offset 0, x19-x28 at offsets 16..80.
   43         stp     x29,x30,[sp,#-96]!
   44         add     x29,sp,#0
   45 
   46         adr     x5,.Lsigma
   47         stp     x19,x20,[sp,#16]
   48         stp     x21,x22,[sp,#32]
   49         stp     x23,x24,[sp,#48]
   50         stp     x25,x26,[sp,#64]
   51         stp     x27,x28,[sp,#80]
      // 64 bytes of scratch below the frame; the tail code stores one keystream block here.
   52         sub     sp,sp,#64
   53 
      // The 16-word input state is kept packed, two 32-bit words per register, in
      // x22-x28 and x30. x30 is free for data because the return address was saved above.
   54         ldp     x22,x23,[x5]            // load sigma
   55         ldp     x24,x25,[x3]            // load key
   56         ldp     x26,x27,[x3,#16]
   57         ldp     x28,x30,[x4]            // load counter
   58 #ifdef  __ARMEB__
      // Big-endian builds only: exchange the 32-bit halves of each loaded register.
   59         ror     x24,x24,#32
   60         ror     x25,x25,#32
   61         ror     x26,x26,#32
   62         ror     x27,x27,#32
   63         ror     x28,x28,#32
   64         ror     x30,x30,#32
   65 #endif
   66 
   67 .Loop_outer:
      // Start of one 64-byte block. Split the eight packed registers x22-x28,x30 into
      // sixteen 32-bit working words: low half via a w-register move, high half via lsr #32.
      //   w5-w8   <- x22,x23     w9-w12          <- x24,x25
      //   w13-w16 <- x26,x27     w17,w19,w20,w21 <- x28,x30   (x18 is not used)
   68         mov     w5,w22                  // unpack key block
   69         lsr     x6,x22,#32
   70         mov     w7,w23
   71         lsr     x8,x23,#32
   72         mov     w9,w24
   73         lsr     x10,x24,#32
   74         mov     w11,w25
   75         lsr     x12,x25,#32
   76         mov     w13,w26
   77         lsr     x14,x26,#32
   78         mov     w15,w27
   79         lsr     x16,x27,#32
   80         mov     w17,w28
   81         lsr     x19,x28,#32
   82         mov     w20,w30
   83         lsr     x21,x30,#32
   84 
   85         mov     x4,#10                  // x4 becomes the iteration counter for .Loop
      // The flags set here are not modified again until the b.lo / b.hi that follow .Loop.
   86         subs    x2,x2,#64
   87 .Loop:
      // One iteration = one column round followed by one diagonal round; x4 counts ten
      // iterations. The four quarter-rounds of each round are interleaved instruction by
      // instruction. A rotate left by n is written as ror by 32-n (16, 20, 24, 25).
   88         sub     x4,x4,#1
      // Column round on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
   89         add     w5,w5,w9
   90         add     w6,w6,w10
   91         add     w7,w7,w11
   92         add     w8,w8,w12
   93         eor     w17,w17,w5
   94         eor     w19,w19,w6
   95         eor     w20,w20,w7
   96         eor     w21,w21,w8
   97         ror     w17,w17,#16
   98         ror     w19,w19,#16
   99         ror     w20,w20,#16
  100         ror     w21,w21,#16
  101         add     w13,w13,w17
  102         add     w14,w14,w19
  103         add     w15,w15,w20
  104         add     w16,w16,w21
  105         eor     w9,w9,w13
  106         eor     w10,w10,w14
  107         eor     w11,w11,w15
  108         eor     w12,w12,w16
  109         ror     w9,w9,#20
  110         ror     w10,w10,#20
  111         ror     w11,w11,#20
  112         ror     w12,w12,#20
  113         add     w5,w5,w9
  114         add     w6,w6,w10
  115         add     w7,w7,w11
  116         add     w8,w8,w12
  117         eor     w17,w17,w5
  118         eor     w19,w19,w6
  119         eor     w20,w20,w7
  120         eor     w21,w21,w8
  121         ror     w17,w17,#24
  122         ror     w19,w19,#24
  123         ror     w20,w20,#24
  124         ror     w21,w21,#24
  125         add     w13,w13,w17
  126         add     w14,w14,w19
  127         add     w15,w15,w20
  128         add     w16,w16,w21
  129         eor     w9,w9,w13
  130         eor     w10,w10,w14
  131         eor     w11,w11,w15
  132         eor     w12,w12,w16
  133         ror     w9,w9,#25
  134         ror     w10,w10,#25
  135         ror     w11,w11,#25
  136         ror     w12,w12,#25
      // Diagonal round on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
  137         add     w5,w5,w10
  138         add     w6,w6,w11
  139         add     w7,w7,w12
  140         add     w8,w8,w9
  141         eor     w21,w21,w5
  142         eor     w17,w17,w6
  143         eor     w19,w19,w7
  144         eor     w20,w20,w8
  145         ror     w21,w21,#16
  146         ror     w17,w17,#16
  147         ror     w19,w19,#16
  148         ror     w20,w20,#16
  149         add     w15,w15,w21
  150         add     w16,w16,w17
  151         add     w13,w13,w19
  152         add     w14,w14,w20
  153         eor     w10,w10,w15
  154         eor     w11,w11,w16
  155         eor     w12,w12,w13
  156         eor     w9,w9,w14
  157         ror     w10,w10,#20
  158         ror     w11,w11,#20
  159         ror     w12,w12,#20
  160         ror     w9,w9,#20
  161         add     w5,w5,w10
  162         add     w6,w6,w11
  163         add     w7,w7,w12
  164         add     w8,w8,w9
  165         eor     w21,w21,w5
  166         eor     w17,w17,w6
  167         eor     w19,w19,w7
  168         eor     w20,w20,w8
  169         ror     w21,w21,#24
  170         ror     w17,w17,#24
  171         ror     w19,w19,#24
  172         ror     w20,w20,#24
  173         add     w15,w15,w21
  174         add     w16,w16,w17
  175         add     w13,w13,w19
  176         add     w14,w14,w20
  177         eor     w10,w10,w15
  178         eor     w11,w11,w16
  179         eor     w12,w12,w13
  180         eor     w9,w9,w14
  181         ror     w10,w10,#25
  182         ror     w11,w11,#25
  183         ror     w12,w12,#25
  184         ror     w9,w9,#25
  185         cbnz    x4,.Loop
  186 
      // Add the input state back in. Words held in the high half of a packed register are
      // added with a 64-bit add of (packed >> 32); a carry into bit 32 is discarded when
      // the sum is shifted left by 32 during packing.
  187         add     w5,w5,w22               // accumulate key block
  188         add     x6,x6,x22,lsr#32
  189         add     w7,w7,w23
  190         add     x8,x8,x23,lsr#32
  191         add     w9,w9,w24
  192         add     x10,x10,x24,lsr#32
  193         add     w11,w11,w25
  194         add     x12,x12,x25,lsr#32
  195         add     w13,w13,w26
  196         add     x14,x14,x26,lsr#32
  197         add     w15,w15,w27
  198         add     x16,x16,x27,lsr#32
  199         add     w17,w17,w28
  200         add     x19,x19,x28,lsr#32
  201         add     w20,w20,w30
  202         add     x21,x21,x30,lsr#32
  203 
      // Flags still come from the subs x2,x2,#64 in .Loop_outer: lo means fewer than
      // 64 bytes were left, so the block is finished byte by byte in .Ltail.
  204         b.lo    .Ltail
  205 
      // Full block: pack word pairs into 64-bit registers while loading 64 input bytes.
  206         add     x5,x5,x6,lsl#32 // pack
  207         add     x7,x7,x8,lsl#32
  208         ldp     x6,x8,[x1,#0]           // load input
  209         add     x9,x9,x10,lsl#32
  210         add     x11,x11,x12,lsl#32
  211         ldp     x10,x12,[x1,#16]
  212         add     x13,x13,x14,lsl#32
  213         add     x15,x15,x16,lsl#32
  214         ldp     x14,x16,[x1,#32]
  215         add     x17,x17,x19,lsl#32
  216         add     x20,x20,x21,lsl#32
  217         ldp     x19,x21,[x1,#48]
  218         add     x1,x1,#64
  219 #ifdef  __ARMEB__
      // Big-endian builds only: byte-reverse the keystream registers before the XOR.
  220         rev     x5,x5
  221         rev     x7,x7
  222         rev     x9,x9
  223         rev     x11,x11
  224         rev     x13,x13
  225         rev     x15,x15
  226         rev     x17,x17
  227         rev     x20,x20
  228 #endif
  229         eor     x5,x5,x6
  230         eor     x7,x7,x8
  231         eor     x9,x9,x10
  232         eor     x11,x11,x12
  233         eor     x13,x13,x14
  234         eor     x15,x15,x16
  235         eor     x17,x17,x19
  236         eor     x20,x20,x21
  237 
  238         stp     x5,x7,[x0,#0]           // store output
      // NOTE(review): this is a 64-bit add on the packed register, so a wrap of the low
      // 32-bit word would carry into the word held in the upper half of x28 - presumably
      // callers never let the counter wrap within one call; confirm against the caller.
  239         add     x28,x28,#1                      // increment counter
  240         stp     x9,x11,[x0,#16]
  241         stp     x13,x15,[x0,#32]
  242         stp     x17,x20,[x0,#48]
  243         add     x0,x0,#64
  244 
      // hi (from the same subs): bytes remain after this block, so generate another one.
  245         b.hi    .Loop_outer
  246 
      // Exactly finished. Restore callee-saved registers through x29 (the frame base),
      // drop the 64-byte scratch area, then pop the 96-byte frame.
  247         ldp     x19,x20,[x29,#16]
  248         add     sp,sp,#64
  249         ldp     x21,x22,[x29,#32]
  250         ldp     x23,x24,[x29,#48]
  251         ldp     x25,x26,[x29,#64]
  252         ldp     x27,x28,[x29,#80]
  253         ldp     x29,x30,[sp],#96
  254 .inst   0xd50323bf                      // autiasp
      // .Labort is the zero-length exit taken at function entry, before any frame exists.
  255 .Labort:
  256         ret
  257 
  258 .align  4
      // Partial final block of the scalar path.
  259 .Ltail:
  260         add     x2,x2,#64               // undo the subs: x2 = bytes still to process
      // .Less_than_64 is also a branch target of .Ltail_neon in ChaCha20_neon, which
      // arrives with the same stack layout and the block in the same scalar registers.
  261 .Less_than_64:
      // Point x1, x0 and x4 just past the end of input, output and the scratch copy, and
      // make x2 a negative index that counts up to zero. x0 carries an extra -1 because
      // x2 is incremented between the loads and the store in .Loop_tail.
  262         sub     x0,x0,#1
  263         add     x1,x1,x2
  264         add     x0,x0,x2
  265         add     x4,sp,x2
  266         neg     x2,x2
  267 
  268         add     x5,x5,x6,lsl#32 // pack
  269         add     x7,x7,x8,lsl#32
  270         add     x9,x9,x10,lsl#32
  271         add     x11,x11,x12,lsl#32
  272         add     x13,x13,x14,lsl#32
  273         add     x15,x15,x16,lsl#32
  274         add     x17,x17,x19,lsl#32
  275         add     x20,x20,x21,lsl#32
  276 #ifdef  __ARMEB__
  277         rev     x5,x5
  278         rev     x7,x7
  279         rev     x9,x9
  280         rev     x11,x11
  281         rev     x13,x13
  282         rev     x15,x15
  283         rev     x17,x17
  284         rev     x20,x20
  285 #endif
      // Write the whole 64-byte keystream block to the scratch area at sp.
  286         stp     x5,x7,[sp,#0]
  287         stp     x9,x11,[sp,#16]
  288         stp     x13,x15,[sp,#32]
  289         stp     x17,x20,[sp,#48]
  290 
  291 .Loop_tail:
      // out[i] = in[i] ^ keystream[i], one byte per iteration.
  292         ldrb    w10,[x1,x2]
  293         ldrb    w11,[x4,x2]
  294         add     x2,x2,#1
  295         eor     w10,w10,w11
  296         strb    w10,[x0,x2]
  297         cbnz    x2,.Loop_tail
  298 
      // Overwrite the keystream copy on the stack with zeros before returning.
  299         stp     xzr,xzr,[sp,#0]
  300         stp     xzr,xzr,[sp,#16]
  301         stp     xzr,xzr,[sp,#32]
  302         stp     xzr,xzr,[sp,#48]
  303 
      // Same epilogue as the full-block exit: restore x19-x28 via x29, release scratch, pop frame.
  304         ldp     x19,x20,[x29,#16]
  305         add     sp,sp,#64
  306         ldp     x21,x22,[x29,#32]
  307         ldp     x23,x24,[x29,#48]
  308         ldp     x25,x26,[x29,#64]
  309         ldp     x27,x28,[x29,#80]
  310         ldp     x29,x30,[sp],#96
  311 .inst   0xd50323bf                      // autiasp
  312         ret
  313 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
  314 
  315 .type   ChaCha20_neon,%function
  316 .align  5
      // ChaCha20_neon: file-local (no .globl) variant reached by a branch from
      // ChaCha20_ctr32 with x0-x4 unchanged. Each outer iteration produces 256 bytes:
      // one block in general-purpose registers and three blocks in NEON registers.
      // Inputs of 512 bytes or more are redirected to .L512_or_more_neon, a label inside
      // ChaCha20_512_neon located right after that function's identical prologue.
  317 ChaCha20_neon:
  318 .inst   0xd503233f                      // paciasp
  319         stp     x29,x30,[sp,#-96]!
  320         add     x29,sp,#0
  321 
  322         adr     x5,.Lsigma
  323         stp     x19,x20,[sp,#16]
  324         stp     x21,x22,[sp,#32]
  325         stp     x23,x24,[sp,#48]
  326         stp     x25,x26,[sp,#64]
  327         stp     x27,x28,[sp,#80]
  328         cmp     x2,#512
  329         b.hs    .L512_or_more_neon
  330 
  331         sub     sp,sp,#64               // 64-byte scratch for a partial final block
  332 
      // The input state is loaded twice: packed in x22-x28,x30 for the scalar block and
      // as rows in v24 (sigma), v25/v26 (key) and v27 (counter block) for the vector blocks.
  333         ldp     x22,x23,[x5]            // load sigma
  334         ld1     {v24.4s},[x5],#16       // post-increment leaves x5 pointing at .Lone
  335         ldp     x24,x25,[x3]            // load key
  336         ldp     x26,x27,[x3,#16]
  337         ld1     {v25.4s,v26.4s},[x3]
  338         ldp     x28,x30,[x4]            // load counter
  339         ld1     {v27.4s},[x4]
  340         ld1     {v31.4s},[x5]           // v31 = {1,0,0,0}
  341 #ifdef  __ARMEB__
      // Big-endian builds only: swap the 32-bit lanes within each 64-bit half of sigma and
      // exchange the 32-bit halves of the packed scalar registers.
  342         rev64   v24.4s,v24.4s
  343         ror     x24,x24,#32
  344         ror     x25,x25,#32
  345         ror     x26,x26,#32
  346         ror     x27,x27,#32
  347         ror     x28,x28,#32
  348         ror     x30,x30,#32
  349 #endif
      // The scalar block uses the counter as loaded; the vector blocks use counter+1,
      // counter+2 and counter+3 (v27, v28, v29).
  350         add     v27.4s,v27.4s,v31.4s            // += 1
  351         add     v28.4s,v27.4s,v31.4s
  352         add     v29.4s,v28.4s,v31.4s
  353         shl     v31.4s,v31.4s,#2                        // 1 -> 4
  354 
  355 .Loop_outer_neon:
      // Start of one 256-byte group. Scalar block: unpack x22-x28,x30 into w5-w17,w19-w21
      // as in the scalar function. Vector blocks: copy the state rows into three working
      // sets (v0-v3), (v4-v7) and (v16-v19); the fourth row of each set takes its own
      // counter value from v27, v28 and v29 respectively.
  356         mov     w5,w22                  // unpack key block
  357         lsr     x6,x22,#32
  358         mov     v0.16b,v24.16b
  359         mov     w7,w23
  360         lsr     x8,x23,#32
  361         mov     v4.16b,v24.16b
  362         mov     w9,w24
  363         lsr     x10,x24,#32
  364         mov     v16.16b,v24.16b
  365         mov     w11,w25
  366         mov     v1.16b,v25.16b
  367         lsr     x12,x25,#32
  368         mov     v5.16b,v25.16b
  369         mov     w13,w26
  370         mov     v17.16b,v25.16b
  371         lsr     x14,x26,#32
  372         mov     v3.16b,v27.16b
  373         mov     w15,w27
  374         mov     v7.16b,v28.16b
  375         lsr     x16,x27,#32
  376         mov     v19.16b,v29.16b
  377         mov     w17,w28
  378         mov     v2.16b,v26.16b
  379         lsr     x19,x28,#32
  380         mov     v6.16b,v26.16b
  381         mov     w20,w30
  382         mov     v18.16b,v26.16b
  383         lsr     x21,x30,#32
  384 
  385         mov     x4,#10                  // x4 becomes the iteration counter for .Loop_neon
      // 256 bytes per outer iteration. The flags set here stay untouched until the
      // b.lo / b.hi that follow .Loop_neon.
  386         subs    x2,x2,#256
  387 .Loop_neon:
      // One iteration = column round + diagonal round for all four blocks, with the scalar
      // and vector instruction streams interleaved line by line.
      // Vector rotates: rev32 on .8h lanes swaps the 16-bit halves of each word (rotate
      // by 16); each ushr #(32-n) / sli #n pair builds a rotate left by n (12, 8, 7) from
      // the temporaries v20-v22. Scalar rotates are ror by 32-n.
  388         sub     x4,x4,#1
      // Column round.
  389         add     v0.4s,v0.4s,v1.4s
  390         add     w5,w5,w9
  391         add     v4.4s,v4.4s,v5.4s
  392         add     w6,w6,w10
  393         add     v16.4s,v16.4s,v17.4s
  394         add     w7,w7,w11
  395         eor     v3.16b,v3.16b,v0.16b
  396         add     w8,w8,w12
  397         eor     v7.16b,v7.16b,v4.16b
  398         eor     w17,w17,w5
  399         eor     v19.16b,v19.16b,v16.16b
  400         eor     w19,w19,w6
  401         rev32   v3.8h,v3.8h
  402         eor     w20,w20,w7
  403         rev32   v7.8h,v7.8h
  404         eor     w21,w21,w8
  405         rev32   v19.8h,v19.8h
  406         ror     w17,w17,#16
  407         add     v2.4s,v2.4s,v3.4s
  408         ror     w19,w19,#16
  409         add     v6.4s,v6.4s,v7.4s
  410         ror     w20,w20,#16
  411         add     v18.4s,v18.4s,v19.4s
  412         ror     w21,w21,#16
  413         eor     v20.16b,v1.16b,v2.16b
  414         add     w13,w13,w17
  415         eor     v21.16b,v5.16b,v6.16b
  416         add     w14,w14,w19
  417         eor     v22.16b,v17.16b,v18.16b
  418         add     w15,w15,w20
  419         ushr    v1.4s,v20.4s,#20
  420         add     w16,w16,w21
  421         ushr    v5.4s,v21.4s,#20
  422         eor     w9,w9,w13
  423         ushr    v17.4s,v22.4s,#20
  424         eor     w10,w10,w14
  425         sli     v1.4s,v20.4s,#12
  426         eor     w11,w11,w15
  427         sli     v5.4s,v21.4s,#12
  428         eor     w12,w12,w16
  429         sli     v17.4s,v22.4s,#12
  430         ror     w9,w9,#20
  431         add     v0.4s,v0.4s,v1.4s
  432         ror     w10,w10,#20
  433         add     v4.4s,v4.4s,v5.4s
  434         ror     w11,w11,#20
  435         add     v16.4s,v16.4s,v17.4s
  436         ror     w12,w12,#20
  437         eor     v20.16b,v3.16b,v0.16b
  438         add     w5,w5,w9
  439         eor     v21.16b,v7.16b,v4.16b
  440         add     w6,w6,w10
  441         eor     v22.16b,v19.16b,v16.16b
  442         add     w7,w7,w11
  443         ushr    v3.4s,v20.4s,#24
  444         add     w8,w8,w12
  445         ushr    v7.4s,v21.4s,#24
  446         eor     w17,w17,w5
  447         ushr    v19.4s,v22.4s,#24
  448         eor     w19,w19,w6
  449         sli     v3.4s,v20.4s,#8
  450         eor     w20,w20,w7
  451         sli     v7.4s,v21.4s,#8
  452         eor     w21,w21,w8
  453         sli     v19.4s,v22.4s,#8
  454         ror     w17,w17,#24
  455         add     v2.4s,v2.4s,v3.4s
  456         ror     w19,w19,#24
  457         add     v6.4s,v6.4s,v7.4s
  458         ror     w20,w20,#24
  459         add     v18.4s,v18.4s,v19.4s
  460         ror     w21,w21,#24
  461         eor     v20.16b,v1.16b,v2.16b
  462         add     w13,w13,w17
  463         eor     v21.16b,v5.16b,v6.16b
  464         add     w14,w14,w19
  465         eor     v22.16b,v17.16b,v18.16b
  466         add     w15,w15,w20
  467         ushr    v1.4s,v20.4s,#25
  468         add     w16,w16,w21
  469         ushr    v5.4s,v21.4s,#25
  470         eor     w9,w9,w13
  471         ushr    v17.4s,v22.4s,#25
  472         eor     w10,w10,w14
  473         sli     v1.4s,v20.4s,#7
  474         eor     w11,w11,w15
  475         sli     v5.4s,v21.4s,#7
  476         eor     w12,w12,w16
  477         sli     v17.4s,v22.4s,#7
  478         ror     w9,w9,#25
      // Rotate vector rows 2, 3 and 1 by 8, 12 and 4 bytes so the next round, written
      // with the same lane-wise operations, works on the diagonals.
  479         ext     v2.16b,v2.16b,v2.16b,#8
  480         ror     w10,w10,#25
  481         ext     v6.16b,v6.16b,v6.16b,#8
  482         ror     w11,w11,#25
  483         ext     v18.16b,v18.16b,v18.16b,#8
  484         ror     w12,w12,#25
  485         ext     v3.16b,v3.16b,v3.16b,#12
  486         ext     v7.16b,v7.16b,v7.16b,#12
  487         ext     v19.16b,v19.16b,v19.16b,#12
  488         ext     v1.16b,v1.16b,v1.16b,#4
  489         ext     v5.16b,v5.16b,v5.16b,#4
  490         ext     v17.16b,v17.16b,v17.16b,#4
      // Diagonal round. The scalar stream selects the diagonals by register choice:
      // (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
  491         add     v0.4s,v0.4s,v1.4s
  492         add     w5,w5,w10
  493         add     v4.4s,v4.4s,v5.4s
  494         add     w6,w6,w11
  495         add     v16.4s,v16.4s,v17.4s
  496         add     w7,w7,w12
  497         eor     v3.16b,v3.16b,v0.16b
  498         add     w8,w8,w9
  499         eor     v7.16b,v7.16b,v4.16b
  500         eor     w21,w21,w5
  501         eor     v19.16b,v19.16b,v16.16b
  502         eor     w17,w17,w6
  503         rev32   v3.8h,v3.8h
  504         eor     w19,w19,w7
  505         rev32   v7.8h,v7.8h
  506         eor     w20,w20,w8
  507         rev32   v19.8h,v19.8h
  508         ror     w21,w21,#16
  509         add     v2.4s,v2.4s,v3.4s
  510         ror     w17,w17,#16
  511         add     v6.4s,v6.4s,v7.4s
  512         ror     w19,w19,#16
  513         add     v18.4s,v18.4s,v19.4s
  514         ror     w20,w20,#16
  515         eor     v20.16b,v1.16b,v2.16b
  516         add     w15,w15,w21
  517         eor     v21.16b,v5.16b,v6.16b
  518         add     w16,w16,w17
  519         eor     v22.16b,v17.16b,v18.16b
  520         add     w13,w13,w19
  521         ushr    v1.4s,v20.4s,#20
  522         add     w14,w14,w20
  523         ushr    v5.4s,v21.4s,#20
  524         eor     w10,w10,w15
  525         ushr    v17.4s,v22.4s,#20
  526         eor     w11,w11,w16
  527         sli     v1.4s,v20.4s,#12
  528         eor     w12,w12,w13
  529         sli     v5.4s,v21.4s,#12
  530         eor     w9,w9,w14
  531         sli     v17.4s,v22.4s,#12
  532         ror     w10,w10,#20
  533         add     v0.4s,v0.4s,v1.4s
  534         ror     w11,w11,#20
  535         add     v4.4s,v4.4s,v5.4s
  536         ror     w12,w12,#20
  537         add     v16.4s,v16.4s,v17.4s
  538         ror     w9,w9,#20
  539         eor     v20.16b,v3.16b,v0.16b
  540         add     w5,w5,w10
  541         eor     v21.16b,v7.16b,v4.16b
  542         add     w6,w6,w11
  543         eor     v22.16b,v19.16b,v16.16b
  544         add     w7,w7,w12
  545         ushr    v3.4s,v20.4s,#24
  546         add     w8,w8,w9
  547         ushr    v7.4s,v21.4s,#24
  548         eor     w21,w21,w5
  549         ushr    v19.4s,v22.4s,#24
  550         eor     w17,w17,w6
  551         sli     v3.4s,v20.4s,#8
  552         eor     w19,w19,w7
  553         sli     v7.4s,v21.4s,#8
  554         eor     w20,w20,w8
  555         sli     v19.4s,v22.4s,#8
  556         ror     w21,w21,#24
  557         add     v2.4s,v2.4s,v3.4s
  558         ror     w17,w17,#24
  559         add     v6.4s,v6.4s,v7.4s
  560         ror     w19,w19,#24
  561         add     v18.4s,v18.4s,v19.4s
  562         ror     w20,w20,#24
  563         eor     v20.16b,v1.16b,v2.16b
  564         add     w15,w15,w21
  565         eor     v21.16b,v5.16b,v6.16b
  566         add     w16,w16,w17
  567         eor     v22.16b,v17.16b,v18.16b
  568         add     w13,w13,w19
  569         ushr    v1.4s,v20.4s,#25
  570         add     w14,w14,w20
  571         ushr    v5.4s,v21.4s,#25
  572         eor     w10,w10,w15
  573         ushr    v17.4s,v22.4s,#25
  574         eor     w11,w11,w16
  575         sli     v1.4s,v20.4s,#7
  576         eor     w12,w12,w13
  577         sli     v5.4s,v21.4s,#7
  578         eor     w9,w9,w14
  579         sli     v17.4s,v22.4s,#7
  580         ror     w10,w10,#25
      // Rotate the vector rows back (8, 4 and 12 bytes) to column order.
  581         ext     v2.16b,v2.16b,v2.16b,#8
  582         ror     w11,w11,#25
  583         ext     v6.16b,v6.16b,v6.16b,#8
  584         ror     w12,w12,#25
  585         ext     v18.16b,v18.16b,v18.16b,#8
  586         ror     w9,w9,#25
  587         ext     v3.16b,v3.16b,v3.16b,#4
  588         ext     v7.16b,v7.16b,v7.16b,#4
  589         ext     v19.16b,v19.16b,v19.16b,#4
  590         ext     v1.16b,v1.16b,v1.16b,#12
  591         ext     v5.16b,v5.16b,v5.16b,#12
  592         ext     v17.16b,v17.16b,v17.16b,#12
  593         cbnz    x4,.Loop_neon
  594 
      // Add the input state back into all four blocks. Each vector block adds its own
      // counter row (v27, v28, v29); scalar high-half words use 64-bit adds of
      // (packed >> 32), whose carry into bit 32 is dropped when the halves are packed.
  595         add     w5,w5,w22               // accumulate key block
  596         add     v0.4s,v0.4s,v24.4s
  597         add     x6,x6,x22,lsr#32
  598         add     v4.4s,v4.4s,v24.4s
  599         add     w7,w7,w23
  600         add     v16.4s,v16.4s,v24.4s
  601         add     x8,x8,x23,lsr#32
  602         add     v2.4s,v2.4s,v26.4s
  603         add     w9,w9,w24
  604         add     v6.4s,v6.4s,v26.4s
  605         add     x10,x10,x24,lsr#32
  606         add     v18.4s,v18.4s,v26.4s
  607         add     w11,w11,w25
  608         add     v3.4s,v3.4s,v27.4s
  609         add     x12,x12,x25,lsr#32
  610         add     w13,w13,w26
  611         add     v7.4s,v7.4s,v28.4s
  612         add     x14,x14,x26,lsr#32
  613         add     w15,w15,w27
  614         add     v19.4s,v19.4s,v29.4s
  615         add     x16,x16,x27,lsr#32
  616         add     w17,w17,w28
  617         add     v1.4s,v1.4s,v25.4s
  618         add     x19,x19,x28,lsr#32
  619         add     w20,w20,w30
  620         add     v5.4s,v5.4s,v25.4s
  621         add     x21,x21,x30,lsr#32
  622         add     v17.4s,v17.4s,v25.4s
  623 
      // Flags still come from the subs x2,x2,#256 in .Loop_outer_neon: lo means fewer
      // than 256 bytes were left for this group.
  624         b.lo    .Ltail_neon
  625 
      // Full 256 bytes. First 64 bytes: the scalar block, packed and XORed as in the
      // scalar function.
  626         add     x5,x5,x6,lsl#32 // pack
  627         add     x7,x7,x8,lsl#32
  628         ldp     x6,x8,[x1,#0]           // load input
  629         add     x9,x9,x10,lsl#32
  630         add     x11,x11,x12,lsl#32
  631         ldp     x10,x12,[x1,#16]
  632         add     x13,x13,x14,lsl#32
  633         add     x15,x15,x16,lsl#32
  634         ldp     x14,x16,[x1,#32]
  635         add     x17,x17,x19,lsl#32
  636         add     x20,x20,x21,lsl#32
  637         ldp     x19,x21,[x1,#48]
  638         add     x1,x1,#64
  639 #ifdef  __ARMEB__
  640         rev     x5,x5
  641         rev     x7,x7
  642         rev     x9,x9
  643         rev     x11,x11
  644         rev     x13,x13
  645         rev     x15,x15
  646         rev     x17,x17
  647         rev     x20,x20
  648 #endif
      // Bytes 64..127 of input go to v20-v23 and are XORed into the first vector block.
  649         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  650         eor     x5,x5,x6
  651         eor     x7,x7,x8
  652         eor     x9,x9,x10
  653         eor     x11,x11,x12
  654         eor     x13,x13,x14
  655         eor     v0.16b,v0.16b,v20.16b
  656         eor     x15,x15,x16
  657         eor     v1.16b,v1.16b,v21.16b
  658         eor     x17,x17,x19
  659         eor     v2.16b,v2.16b,v22.16b
  660         eor     x20,x20,x21
  661         eor     v3.16b,v3.16b,v23.16b
  662         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  663 
  664         stp     x5,x7,[x0,#0]           // store output
      // Advance every counter by 4 for the next group (v31 holds {4,0,0,0} here).
  665         add     x28,x28,#4                      // increment counter
  666         stp     x9,x11,[x0,#16]
  667         add     v27.4s,v27.4s,v31.4s            // += 4
  668         stp     x13,x15,[x0,#32]
  669         add     v28.4s,v28.4s,v31.4s
  670         stp     x17,x20,[x0,#48]
  671         add     v29.4s,v29.4s,v31.4s
  672         add     x0,x0,#64
  673 
      // v0-v3 are written out and then reused as the input buffer for the last 64 bytes.
  674         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  675         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  676 
  677         eor     v4.16b,v4.16b,v20.16b
  678         eor     v5.16b,v5.16b,v21.16b
  679         eor     v6.16b,v6.16b,v22.16b
  680         eor     v7.16b,v7.16b,v23.16b
  681         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  682 
  683         eor     v16.16b,v16.16b,v0.16b
  684         eor     v17.16b,v17.16b,v1.16b
  685         eor     v18.16b,v18.16b,v2.16b
  686         eor     v19.16b,v19.16b,v3.16b
  687         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  688 
      // hi (from the same subs): more input remains, so produce another 256-byte group.
  689         b.hi    .Loop_outer_neon
  690 
      // Exactly finished. Nothing was written to the scratch area on this path, so it is
      // released without being cleared.
  691         ldp     x19,x20,[x29,#16]
  692         add     sp,sp,#64
  693         ldp     x21,x22,[x29,#32]
  694         ldp     x23,x24,[x29,#48]
  695         ldp     x25,x26,[x29,#64]
  696         ldp     x27,x28,[x29,#80]
  697         ldp     x29,x30,[sp],#96
  698 .inst   0xd50323bf                      // autiasp
  699         ret
  700 
  701 .Ltail_neon:
      // Fewer than 256 bytes remain. All four keystream blocks are already computed; they
      // are consumed 64 bytes at a time and the first block that is only partly needed is
      // spilled to the scratch area for the byte loop in .Last_neon.
  702         add     x2,x2,#256              // undo the subs: x2 = bytes still to process
  703         cmp     x2,#64
      // Under 64 bytes: reuse the scalar tail inside ChaCha20_ctr32 (same stack layout,
      // scalar block still unpacked in w5-w17,w19-w21).
  704         b.lo    .Less_than_64
  705 
  706         add     x5,x5,x6,lsl#32 // pack
  707         add     x7,x7,x8,lsl#32
  708         ldp     x6,x8,[x1,#0]           // load input
  709         add     x9,x9,x10,lsl#32
  710         add     x11,x11,x12,lsl#32
  711         ldp     x10,x12,[x1,#16]
  712         add     x13,x13,x14,lsl#32
  713         add     x15,x15,x16,lsl#32
  714         ldp     x14,x16,[x1,#32]
  715         add     x17,x17,x19,lsl#32
  716         add     x20,x20,x21,lsl#32
  717         ldp     x19,x21,[x1,#48]
  718         add     x1,x1,#64
  719 #ifdef  __ARMEB__
  720         rev     x5,x5
  721         rev     x7,x7
  722         rev     x9,x9
  723         rev     x11,x11
  724         rev     x13,x13
  725         rev     x15,x15
  726         rev     x17,x17
  727         rev     x20,x20
  728 #endif
  729         eor     x5,x5,x6
  730         eor     x7,x7,x8
  731         eor     x9,x9,x10
  732         eor     x11,x11,x12
  733         eor     x13,x13,x14
  734         eor     x15,x15,x16
  735         eor     x17,x17,x19
  736         eor     x20,x20,x21
  737 
  738         stp     x5,x7,[x0,#0]           // store output
      // x28 is not read again on this path and is reloaded from the frame at .Ldone_neon.
  739         add     x28,x28,#4                      // increment counter
  740         stp     x9,x11,[x0,#16]
  741         stp     x13,x15,[x0,#32]
  742         stp     x17,x20,[x0,#48]
  743         add     x0,x0,#64
      // Each b.eq below tests the most recent cmp x2,#64 (nothing in between changes the
      // flags): equal means the block just written was the last one.
  744         b.eq    .Ldone_neon
  745         sub     x2,x2,#64
  746         cmp     x2,#64
  747         b.lo    .Less_than_128
  748 
      // Second 64 bytes: first vector block (v0-v3).
  749         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  750         eor     v0.16b,v0.16b,v20.16b
  751         eor     v1.16b,v1.16b,v21.16b
  752         eor     v2.16b,v2.16b,v22.16b
  753         eor     v3.16b,v3.16b,v23.16b
  754         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  755         b.eq    .Ldone_neon
  756         sub     x2,x2,#64
  757         cmp     x2,#64
  758         b.lo    .Less_than_192
  759 
      // Third 64 bytes: second vector block (v4-v7).
  760         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  761         eor     v4.16b,v4.16b,v20.16b
  762         eor     v5.16b,v5.16b,v21.16b
  763         eor     v6.16b,v6.16b,v22.16b
  764         eor     v7.16b,v7.16b,v23.16b
  765         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  766         b.eq    .Ldone_neon
  767         sub     x2,x2,#64
  768 
      // Spill whichever block covers the remaining bytes to the 64-byte scratch at sp.
  769         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
  770         b       .Last_neon
  771 
  772 .Less_than_128:
  773         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  774         b       .Last_neon
  775 .Less_than_192:
  776         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  777         b       .Last_neon
  778 
  779 .align  4
      // Final partial block of the NEON path: x2 = bytes left, keystream block at sp.
  780 .Last_neon:
      // Point x1, x0 and x4 just past the end of input, output and the scratch copy, and
      // make x2 a negative index that counts up to zero. x0 carries an extra -1 because
      // x2 is incremented between the loads and the store in the loop.
  781         sub     x0,x0,#1
  782         add     x1,x1,x2
  783         add     x0,x0,x2
  784         add     x4,sp,x2
  785         neg     x2,x2
  786 
  787 .Loop_tail_neon:
      // out[i] = in[i] ^ keystream[i], one byte per iteration.
  788         ldrb    w10,[x1,x2]
  789         ldrb    w11,[x4,x2]
  790         add     x2,x2,#1
  791         eor     w10,w10,w11
  792         strb    w10,[x0,x2]
  793         cbnz    x2,.Loop_tail_neon
  794 
      // Overwrite the keystream copy on the stack with zeros before returning.
  795         stp     xzr,xzr,[sp,#0]
  796         stp     xzr,xzr,[sp,#16]
  797         stp     xzr,xzr,[sp,#32]
  798         stp     xzr,xzr,[sp,#48]
  799 
      // .Ldone_neon is also entered directly from .Ltail_neon when the data ended on a
      // 64-byte boundary; nothing was spilled to the scratch area in that case.
  800 .Ldone_neon:
  801         ldp     x19,x20,[x29,#16]
  802         add     sp,sp,#64
  803         ldp     x21,x22,[x29,#32]
  804         ldp     x23,x24,[x29,#48]
  805         ldp     x25,x26,[x29,#64]
  806         ldp     x27,x28,[x29,#80]
  807         ldp     x29,x30,[sp],#96
  808 .inst   0xd50323bf                      // autiasp
  809         ret
  810 .size   ChaCha20_neon,.-ChaCha20_neon
  811 .type   ChaCha20_512_neon,%function
  812 .align  5
  813 ChaCha20_512_neon:
  814 .inst   0xd503233f                      // paciasp
  815         stp     x29,x30,[sp,#-96]!
  816         add     x29,sp,#0
  817 
  818         adr     x5,.Lsigma
  819         stp     x19,x20,[sp,#16]
  820         stp     x21,x22,[sp,#32]
  821         stp     x23,x24,[sp,#48]
  822         stp     x25,x26,[sp,#64]
  823         stp     x27,x28,[sp,#80]
  824 
  825 .L512_or_more_neon:
  826         sub     sp,sp,#128+64
  827 
  828         ldp     x22,x23,[x5]            // load sigma
  829         ld1     {v24.4s},[x5],#16
  830         ldp     x24,x25,[x3]            // load key
  831         ldp     x26,x27,[x3,#16]
  832         ld1     {v25.4s,v26.4s},[x3]
  833         ldp     x28,x30,[x4]            // load counter
  834         ld1     {v27.4s},[x4]
  835         ld1     {v31.4s},[x5]
  836 #ifdef  __ARMEB__
  837         rev64   v24.4s,v24.4s
  838         ror     x24,x24,#32
  839         ror     x25,x25,#32
  840         ror     x26,x26,#32
  841         ror     x27,x27,#32
  842         ror     x28,x28,#32
  843         ror     x30,x30,#32
  844 #endif
  845         add     v27.4s,v27.4s,v31.4s            // += 1
  846         stp     q24,q25,[sp,#0]         // off-load key block, invariant part
  847         add     v27.4s,v27.4s,v31.4s            // not typo
  848         str     q26,[sp,#32]
  849         add     v28.4s,v27.4s,v31.4s
  850         add     v29.4s,v28.4s,v31.4s
  851         add     v30.4s,v29.4s,v31.4s
  852         shl     v31.4s,v31.4s,#2                        // 1 -> 4
  853 
  854         stp     d8,d9,[sp,#128+0]               // meet ABI requirements
  855         stp     d10,d11,[sp,#128+16]
  856         stp     d12,d13,[sp,#128+32]
  857         stp     d14,d15,[sp,#128+48]
  858 
  859         sub     x2,x2,#512                      // not typo
  860 
  861 .Loop_outer_512_neon:
  862         mov     v0.16b,v24.16b
  863         mov     v4.16b,v24.16b
  864         mov     v8.16b,v24.16b
  865         mov     v12.16b,v24.16b
  866         mov     v16.16b,v24.16b
  867         mov     v20.16b,v24.16b
  868         mov     v1.16b,v25.16b
  869         mov     w5,w22                  // unpack key block
  870         mov     v5.16b,v25.16b
  871         lsr     x6,x22,#32
  872         mov     v9.16b,v25.16b
  873         mov     w7,w23
  874         mov     v13.16b,v25.16b
  875         lsr     x8,x23,#32
  876         mov     v17.16b,v25.16b
  877         mov     w9,w24
  878         mov     v21.16b,v25.16b
  879         lsr     x10,x24,#32
  880         mov     v3.16b,v27.16b
  881         mov     w11,w25
  882         mov     v7.16b,v28.16b
  883         lsr     x12,x25,#32
  884         mov     v11.16b,v29.16b
  885         mov     w13,w26
  886         mov     v15.16b,v30.16b
  887         lsr     x14,x26,#32
  888         mov     v2.16b,v26.16b
  889         mov     w15,w27
  890         mov     v6.16b,v26.16b
  891         lsr     x16,x27,#32
  892         add     v19.4s,v3.4s,v31.4s                     // +4
  893         mov     w17,w28
  894         add     v23.4s,v7.4s,v31.4s                     // +4
  895         lsr     x19,x28,#32
  896         mov     v10.16b,v26.16b
  897         mov     w20,w30
  898         mov     v14.16b,v26.16b
  899         lsr     x21,x30,#32
  900         mov     v18.16b,v26.16b
  901         stp     q27,q28,[sp,#48]                // off-load key block, variable part
  902         mov     v22.16b,v26.16b
  903         str     q29,[sp,#80]
  904 
  905         mov     x4,#5
  906         subs    x2,x2,#512
  907 .Loop_upper_neon:
  908         sub     x4,x4,#1
  909         add     v0.4s,v0.4s,v1.4s
  910         add     w5,w5,w9
  911         add     v4.4s,v4.4s,v5.4s
  912         add     w6,w6,w10
  913         add     v8.4s,v8.4s,v9.4s
  914         add     w7,w7,w11
  915         add     v12.4s,v12.4s,v13.4s
  916         add     w8,w8,w12
  917         add     v16.4s,v16.4s,v17.4s
  918         eor     w17,w17,w5
  919         add     v20.4s,v20.4s,v21.4s
  920         eor     w19,w19,w6
  921         eor     v3.16b,v3.16b,v0.16b
  922         eor     w20,w20,w7
  923         eor     v7.16b,v7.16b,v4.16b
  924         eor     w21,w21,w8
  925         eor     v11.16b,v11.16b,v8.16b
  926         ror     w17,w17,#16
  927         eor     v15.16b,v15.16b,v12.16b
  928         ror     w19,w19,#16
  929         eor     v19.16b,v19.16b,v16.16b
  930         ror     w20,w20,#16
  931         eor     v23.16b,v23.16b,v20.16b
  932         ror     w21,w21,#16
  933         rev32   v3.8h,v3.8h
  934         add     w13,w13,w17
  935         rev32   v7.8h,v7.8h
  936         add     w14,w14,w19
  937         rev32   v11.8h,v11.8h
  938         add     w15,w15,w20
  939         rev32   v15.8h,v15.8h
  940         add     w16,w16,w21
  941         rev32   v19.8h,v19.8h
  942         eor     w9,w9,w13
  943         rev32   v23.8h,v23.8h
  944         eor     w10,w10,w14
  945         add     v2.4s,v2.4s,v3.4s
  946         eor     w11,w11,w15
  947         add     v6.4s,v6.4s,v7.4s
  948         eor     w12,w12,w16
  949         add     v10.4s,v10.4s,v11.4s
  950         ror     w9,w9,#20
  951         add     v14.4s,v14.4s,v15.4s
  952         ror     w10,w10,#20
  953         add     v18.4s,v18.4s,v19.4s
  954         ror     w11,w11,#20
  955         add     v22.4s,v22.4s,v23.4s
  956         ror     w12,w12,#20
  957         eor     v24.16b,v1.16b,v2.16b
  958         add     w5,w5,w9
  959         eor     v25.16b,v5.16b,v6.16b
  960         add     w6,w6,w10
  961         eor     v26.16b,v9.16b,v10.16b
  962         add     w7,w7,w11
  963         eor     v27.16b,v13.16b,v14.16b
  964         add     w8,w8,w12
  965         eor     v28.16b,v17.16b,v18.16b
  966         eor     w17,w17,w5
  967         eor     v29.16b,v21.16b,v22.16b
  968         eor     w19,w19,w6
  969         ushr    v1.4s,v24.4s,#20
  970         eor     w20,w20,w7
  971         ushr    v5.4s,v25.4s,#20
  972         eor     w21,w21,w8
  973         ushr    v9.4s,v26.4s,#20
  974         ror     w17,w17,#24
  975         ushr    v13.4s,v27.4s,#20
  976         ror     w19,w19,#24
  977         ushr    v17.4s,v28.4s,#20
  978         ror     w20,w20,#24
  979         ushr    v21.4s,v29.4s,#20
  980         ror     w21,w21,#24
  981         sli     v1.4s,v24.4s,#12
  982         add     w13,w13,w17
  983         sli     v5.4s,v25.4s,#12
  984         add     w14,w14,w19
  985         sli     v9.4s,v26.4s,#12
  986         add     w15,w15,w20
  987         sli     v13.4s,v27.4s,#12
  988         add     w16,w16,w21
  989         sli     v17.4s,v28.4s,#12
  990         eor     w9,w9,w13
  991         sli     v21.4s,v29.4s,#12
  992         eor     w10,w10,w14
  993         add     v0.4s,v0.4s,v1.4s
  994         eor     w11,w11,w15
  995         add     v4.4s,v4.4s,v5.4s
  996         eor     w12,w12,w16
  997         add     v8.4s,v8.4s,v9.4s
  998         ror     w9,w9,#25
  999         add     v12.4s,v12.4s,v13.4s
 1000         ror     w10,w10,#25
 1001         add     v16.4s,v16.4s,v17.4s
 1002         ror     w11,w11,#25
 1003         add     v20.4s,v20.4s,v21.4s
 1004         ror     w12,w12,#25
 1005         eor     v24.16b,v3.16b,v0.16b
 1006         add     w5,w5,w10
 1007         eor     v25.16b,v7.16b,v4.16b
 1008         add     w6,w6,w11
 1009         eor     v26.16b,v11.16b,v8.16b
 1010         add     w7,w7,w12
 1011         eor     v27.16b,v15.16b,v12.16b
 1012         add     w8,w8,w9
 1013         eor     v28.16b,v19.16b,v16.16b
 1014         eor     w21,w21,w5
 1015         eor     v29.16b,v23.16b,v20.16b
 1016         eor     w17,w17,w6
 1017         ushr    v3.4s,v24.4s,#24
 1018         eor     w19,w19,w7
 1019         ushr    v7.4s,v25.4s,#24
 1020         eor     w20,w20,w8
 1021         ushr    v11.4s,v26.4s,#24
 1022         ror     w21,w21,#16
 1023         ushr    v15.4s,v27.4s,#24
 1024         ror     w17,w17,#16
 1025         ushr    v19.4s,v28.4s,#24
 1026         ror     w19,w19,#16
 1027         ushr    v23.4s,v29.4s,#24
 1028         ror     w20,w20,#16
 1029         sli     v3.4s,v24.4s,#8
 1030         add     w15,w15,w21
 1031         sli     v7.4s,v25.4s,#8
 1032         add     w16,w16,w17
 1033         sli     v11.4s,v26.4s,#8
 1034         add     w13,w13,w19
 1035         sli     v15.4s,v27.4s,#8
 1036         add     w14,w14,w20
 1037         sli     v19.4s,v28.4s,#8
 1038         eor     w10,w10,w15
 1039         sli     v23.4s,v29.4s,#8
 1040         eor     w11,w11,w16
 1041         add     v2.4s,v2.4s,v3.4s
 1042         eor     w12,w12,w13
 1043         add     v6.4s,v6.4s,v7.4s
 1044         eor     w9,w9,w14
 1045         add     v10.4s,v10.4s,v11.4s
 1046         ror     w10,w10,#20
 1047         add     v14.4s,v14.4s,v15.4s
 1048         ror     w11,w11,#20
 1049         add     v18.4s,v18.4s,v19.4s
 1050         ror     w12,w12,#20
 1051         add     v22.4s,v22.4s,v23.4s
 1052         ror     w9,w9,#20
 1053         eor     v24.16b,v1.16b,v2.16b
 1054         add     w5,w5,w10
 1055         eor     v25.16b,v5.16b,v6.16b
 1056         add     w6,w6,w11
 1057         eor     v26.16b,v9.16b,v10.16b
 1058         add     w7,w7,w12
 1059         eor     v27.16b,v13.16b,v14.16b
 1060         add     w8,w8,w9
 1061         eor     v28.16b,v17.16b,v18.16b
 1062         eor     w21,w21,w5
 1063         eor     v29.16b,v21.16b,v22.16b
 1064         eor     w17,w17,w6
 1065         ushr    v1.4s,v24.4s,#25
 1066         eor     w19,w19,w7
 1067         ushr    v5.4s,v25.4s,#25
 1068         eor     w20,w20,w8
 1069         ushr    v9.4s,v26.4s,#25
 1070         ror     w21,w21,#24
 1071         ushr    v13.4s,v27.4s,#25
 1072         ror     w17,w17,#24
 1073         ushr    v17.4s,v28.4s,#25
 1074         ror     w19,w19,#24
 1075         ushr    v21.4s,v29.4s,#25
 1076         ror     w20,w20,#24
 1077         sli     v1.4s,v24.4s,#7
 1078         add     w15,w15,w21
 1079         sli     v5.4s,v25.4s,#7
 1080         add     w16,w16,w17
 1081         sli     v9.4s,v26.4s,#7
 1082         add     w13,w13,w19
 1083         sli     v13.4s,v27.4s,#7
 1084         add     w14,w14,w20
 1085         sli     v17.4s,v28.4s,#7
 1086         eor     w10,w10,w15
 1087         sli     v21.4s,v29.4s,#7
 1088         eor     w11,w11,w16
 1089         ext     v2.16b,v2.16b,v2.16b,#8
 1090         eor     w12,w12,w13
 1091         ext     v6.16b,v6.16b,v6.16b,#8
 1092         eor     w9,w9,w14
 1093         ext     v10.16b,v10.16b,v10.16b,#8
 1094         ror     w10,w10,#25
 1095         ext     v14.16b,v14.16b,v14.16b,#8
 1096         ror     w11,w11,#25
 1097         ext     v18.16b,v18.16b,v18.16b,#8
 1098         ror     w12,w12,#25
 1099         ext     v22.16b,v22.16b,v22.16b,#8
 1100         ror     w9,w9,#25
 1101         ext     v3.16b,v3.16b,v3.16b,#12
 1102         ext     v7.16b,v7.16b,v7.16b,#12
 1103         ext     v11.16b,v11.16b,v11.16b,#12
 1104         ext     v15.16b,v15.16b,v15.16b,#12
 1105         ext     v19.16b,v19.16b,v19.16b,#12
 1106         ext     v23.16b,v23.16b,v23.16b,#12
 1107         ext     v1.16b,v1.16b,v1.16b,#4
 1108         ext     v5.16b,v5.16b,v5.16b,#4
 1109         ext     v9.16b,v9.16b,v9.16b,#4
 1110         ext     v13.16b,v13.16b,v13.16b,#4
 1111         ext     v17.16b,v17.16b,v17.16b,#4
 1112         ext     v21.16b,v21.16b,v21.16b,#4
 1113         add     v0.4s,v0.4s,v1.4s
 1114         add     w5,w5,w9
 1115         add     v4.4s,v4.4s,v5.4s
 1116         add     w6,w6,w10
 1117         add     v8.4s,v8.4s,v9.4s
 1118         add     w7,w7,w11
 1119         add     v12.4s,v12.4s,v13.4s
 1120         add     w8,w8,w12
 1121         add     v16.4s,v16.4s,v17.4s
 1122         eor     w17,w17,w5
 1123         add     v20.4s,v20.4s,v21.4s
 1124         eor     w19,w19,w6
 1125         eor     v3.16b,v3.16b,v0.16b
 1126         eor     w20,w20,w7
 1127         eor     v7.16b,v7.16b,v4.16b
 1128         eor     w21,w21,w8
 1129         eor     v11.16b,v11.16b,v8.16b
 1130         ror     w17,w17,#16
 1131         eor     v15.16b,v15.16b,v12.16b
 1132         ror     w19,w19,#16
 1133         eor     v19.16b,v19.16b,v16.16b
 1134         ror     w20,w20,#16
 1135         eor     v23.16b,v23.16b,v20.16b
 1136         ror     w21,w21,#16
 1137         rev32   v3.8h,v3.8h
 1138         add     w13,w13,w17
 1139         rev32   v7.8h,v7.8h
 1140         add     w14,w14,w19
 1141         rev32   v11.8h,v11.8h
 1142         add     w15,w15,w20
 1143         rev32   v15.8h,v15.8h
 1144         add     w16,w16,w21
 1145         rev32   v19.8h,v19.8h
 1146         eor     w9,w9,w13
 1147         rev32   v23.8h,v23.8h
 1148         eor     w10,w10,w14
 1149         add     v2.4s,v2.4s,v3.4s
 1150         eor     w11,w11,w15
 1151         add     v6.4s,v6.4s,v7.4s
 1152         eor     w12,w12,w16
 1153         add     v10.4s,v10.4s,v11.4s
 1154         ror     w9,w9,#20
 1155         add     v14.4s,v14.4s,v15.4s
 1156         ror     w10,w10,#20
 1157         add     v18.4s,v18.4s,v19.4s
 1158         ror     w11,w11,#20
 1159         add     v22.4s,v22.4s,v23.4s
 1160         ror     w12,w12,#20
 1161         eor     v24.16b,v1.16b,v2.16b
 1162         add     w5,w5,w9
 1163         eor     v25.16b,v5.16b,v6.16b
 1164         add     w6,w6,w10
 1165         eor     v26.16b,v9.16b,v10.16b
 1166         add     w7,w7,w11
 1167         eor     v27.16b,v13.16b,v14.16b
 1168         add     w8,w8,w12
 1169         eor     v28.16b,v17.16b,v18.16b
 1170         eor     w17,w17,w5
 1171         eor     v29.16b,v21.16b,v22.16b
 1172         eor     w19,w19,w6
 1173         ushr    v1.4s,v24.4s,#20
 1174         eor     w20,w20,w7
 1175         ushr    v5.4s,v25.4s,#20
 1176         eor     w21,w21,w8
 1177         ushr    v9.4s,v26.4s,#20
 1178         ror     w17,w17,#24
 1179         ushr    v13.4s,v27.4s,#20
 1180         ror     w19,w19,#24
 1181         ushr    v17.4s,v28.4s,#20
 1182         ror     w20,w20,#24
 1183         ushr    v21.4s,v29.4s,#20
 1184         ror     w21,w21,#24
 1185         sli     v1.4s,v24.4s,#12
 1186         add     w13,w13,w17
 1187         sli     v5.4s,v25.4s,#12
 1188         add     w14,w14,w19
 1189         sli     v9.4s,v26.4s,#12
 1190         add     w15,w15,w20
 1191         sli     v13.4s,v27.4s,#12
 1192         add     w16,w16,w21
 1193         sli     v17.4s,v28.4s,#12
 1194         eor     w9,w9,w13
 1195         sli     v21.4s,v29.4s,#12
 1196         eor     w10,w10,w14
 1197         add     v0.4s,v0.4s,v1.4s
 1198         eor     w11,w11,w15
 1199         add     v4.4s,v4.4s,v5.4s
 1200         eor     w12,w12,w16
 1201         add     v8.4s,v8.4s,v9.4s
 1202         ror     w9,w9,#25
 1203         add     v12.4s,v12.4s,v13.4s
 1204         ror     w10,w10,#25
 1205         add     v16.4s,v16.4s,v17.4s
 1206         ror     w11,w11,#25
 1207         add     v20.4s,v20.4s,v21.4s
 1208         ror     w12,w12,#25
 1209         eor     v24.16b,v3.16b,v0.16b
 1210         add     w5,w5,w10
 1211         eor     v25.16b,v7.16b,v4.16b
 1212         add     w6,w6,w11
 1213         eor     v26.16b,v11.16b,v8.16b
 1214         add     w7,w7,w12
 1215         eor     v27.16b,v15.16b,v12.16b
 1216         add     w8,w8,w9
 1217         eor     v28.16b,v19.16b,v16.16b
 1218         eor     w21,w21,w5
 1219         eor     v29.16b,v23.16b,v20.16b
 1220         eor     w17,w17,w6
 1221         ushr    v3.4s,v24.4s,#24
 1222         eor     w19,w19,w7
 1223         ushr    v7.4s,v25.4s,#24
 1224         eor     w20,w20,w8
 1225         ushr    v11.4s,v26.4s,#24
 1226         ror     w21,w21,#16
 1227         ushr    v15.4s,v27.4s,#24
 1228         ror     w17,w17,#16
 1229         ushr    v19.4s,v28.4s,#24
 1230         ror     w19,w19,#16
 1231         ushr    v23.4s,v29.4s,#24
 1232         ror     w20,w20,#16
 1233         sli     v3.4s,v24.4s,#8
 1234         add     w15,w15,w21
 1235         sli     v7.4s,v25.4s,#8
 1236         add     w16,w16,w17
 1237         sli     v11.4s,v26.4s,#8
 1238         add     w13,w13,w19
 1239         sli     v15.4s,v27.4s,#8
 1240         add     w14,w14,w20
 1241         sli     v19.4s,v28.4s,#8
 1242         eor     w10,w10,w15
 1243         sli     v23.4s,v29.4s,#8
 1244         eor     w11,w11,w16
 1245         add     v2.4s,v2.4s,v3.4s
 1246         eor     w12,w12,w13
 1247         add     v6.4s,v6.4s,v7.4s
 1248         eor     w9,w9,w14
 1249         add     v10.4s,v10.4s,v11.4s
 1250         ror     w10,w10,#20
 1251         add     v14.4s,v14.4s,v15.4s
 1252         ror     w11,w11,#20
 1253         add     v18.4s,v18.4s,v19.4s
 1254         ror     w12,w12,#20
 1255         add     v22.4s,v22.4s,v23.4s
 1256         ror     w9,w9,#20
 1257         eor     v24.16b,v1.16b,v2.16b
 1258         add     w5,w5,w10
 1259         eor     v25.16b,v5.16b,v6.16b
 1260         add     w6,w6,w11
 1261         eor     v26.16b,v9.16b,v10.16b
 1262         add     w7,w7,w12
 1263         eor     v27.16b,v13.16b,v14.16b
 1264         add     w8,w8,w9
 1265         eor     v28.16b,v17.16b,v18.16b
 1266         eor     w21,w21,w5
 1267         eor     v29.16b,v21.16b,v22.16b
 1268         eor     w17,w17,w6
 1269         ushr    v1.4s,v24.4s,#25
 1270         eor     w19,w19,w7
 1271         ushr    v5.4s,v25.4s,#25
 1272         eor     w20,w20,w8
 1273         ushr    v9.4s,v26.4s,#25
 1274         ror     w21,w21,#24
 1275         ushr    v13.4s,v27.4s,#25
 1276         ror     w17,w17,#24
 1277         ushr    v17.4s,v28.4s,#25
 1278         ror     w19,w19,#24
 1279         ushr    v21.4s,v29.4s,#25
 1280         ror     w20,w20,#24
 1281         sli     v1.4s,v24.4s,#7
 1282         add     w15,w15,w21
 1283         sli     v5.4s,v25.4s,#7
 1284         add     w16,w16,w17
 1285         sli     v9.4s,v26.4s,#7
 1286         add     w13,w13,w19
 1287         sli     v13.4s,v27.4s,#7
 1288         add     w14,w14,w20
 1289         sli     v17.4s,v28.4s,#7
 1290         eor     w10,w10,w15
 1291         sli     v21.4s,v29.4s,#7
 1292         eor     w11,w11,w16
 1293         ext     v2.16b,v2.16b,v2.16b,#8
 1294         eor     w12,w12,w13
 1295         ext     v6.16b,v6.16b,v6.16b,#8
 1296         eor     w9,w9,w14
 1297         ext     v10.16b,v10.16b,v10.16b,#8
 1298         ror     w10,w10,#25
 1299         ext     v14.16b,v14.16b,v14.16b,#8
 1300         ror     w11,w11,#25
 1301         ext     v18.16b,v18.16b,v18.16b,#8
 1302         ror     w12,w12,#25
 1303         ext     v22.16b,v22.16b,v22.16b,#8
 1304         ror     w9,w9,#25
 1305         ext     v3.16b,v3.16b,v3.16b,#4
 1306         ext     v7.16b,v7.16b,v7.16b,#4
 1307         ext     v11.16b,v11.16b,v11.16b,#4
 1308         ext     v15.16b,v15.16b,v15.16b,#4
 1309         ext     v19.16b,v19.16b,v19.16b,#4
 1310         ext     v23.16b,v23.16b,v23.16b,#4
 1311         ext     v1.16b,v1.16b,v1.16b,#12
 1312         ext     v5.16b,v5.16b,v5.16b,#12
 1313         ext     v9.16b,v9.16b,v9.16b,#12
 1314         ext     v13.16b,v13.16b,v13.16b,#12
 1315         ext     v17.16b,v17.16b,v17.16b,#12
 1316         ext     v21.16b,v21.16b,v21.16b,#12
 1317         cbnz    x4,.Loop_upper_neon
 1318 
 1319         add     w5,w5,w22               // accumulate key block
 1320         add     x6,x6,x22,lsr#32
 1321         add     w7,w7,w23
 1322         add     x8,x8,x23,lsr#32
 1323         add     w9,w9,w24
 1324         add     x10,x10,x24,lsr#32
 1325         add     w11,w11,w25
 1326         add     x12,x12,x25,lsr#32
 1327         add     w13,w13,w26
 1328         add     x14,x14,x26,lsr#32
 1329         add     w15,w15,w27
 1330         add     x16,x16,x27,lsr#32
 1331         add     w17,w17,w28
 1332         add     x19,x19,x28,lsr#32
 1333         add     w20,w20,w30
 1334         add     x21,x21,x30,lsr#32
 1335 
 1336         add     x5,x5,x6,lsl#32 // pack
 1337         add     x7,x7,x8,lsl#32
 1338         ldp     x6,x8,[x1,#0]           // load input
 1339         add     x9,x9,x10,lsl#32
 1340         add     x11,x11,x12,lsl#32
 1341         ldp     x10,x12,[x1,#16]
 1342         add     x13,x13,x14,lsl#32
 1343         add     x15,x15,x16,lsl#32
 1344         ldp     x14,x16,[x1,#32]
 1345         add     x17,x17,x19,lsl#32
 1346         add     x20,x20,x21,lsl#32
 1347         ldp     x19,x21,[x1,#48]
 1348         add     x1,x1,#64
 1349 #ifdef  __ARMEB__
 1350         rev     x5,x5
 1351         rev     x7,x7
 1352         rev     x9,x9
 1353         rev     x11,x11
 1354         rev     x13,x13
 1355         rev     x15,x15
 1356         rev     x17,x17
 1357         rev     x20,x20
 1358 #endif
 1359         eor     x5,x5,x6
 1360         eor     x7,x7,x8
 1361         eor     x9,x9,x10
 1362         eor     x11,x11,x12
 1363         eor     x13,x13,x14
 1364         eor     x15,x15,x16
 1365         eor     x17,x17,x19
 1366         eor     x20,x20,x21
 1367 
 1368         stp     x5,x7,[x0,#0]           // store output
 1369         add     x28,x28,#1                      // increment counter
 1370         mov     w5,w22                  // unpack key block
 1371         lsr     x6,x22,#32
 1372         stp     x9,x11,[x0,#16]
 1373         mov     w7,w23
 1374         lsr     x8,x23,#32
 1375         stp     x13,x15,[x0,#32]
 1376         mov     w9,w24
 1377         lsr     x10,x24,#32
 1378         stp     x17,x20,[x0,#48]
 1379         add     x0,x0,#64
 1380         mov     w11,w25
 1381         lsr     x12,x25,#32
 1382         mov     w13,w26
 1383         lsr     x14,x26,#32
 1384         mov     w15,w27
 1385         lsr     x16,x27,#32
 1386         mov     w17,w28
 1387         lsr     x19,x28,#32
 1388         mov     w20,w30
 1389         lsr     x21,x30,#32
 1390 
 1391         mov     x4,#5
 1392 .Loop_lower_neon:
 1393         sub     x4,x4,#1
 1394         add     v0.4s,v0.4s,v1.4s
 1395         add     w5,w5,w9
 1396         add     v4.4s,v4.4s,v5.4s
 1397         add     w6,w6,w10
 1398         add     v8.4s,v8.4s,v9.4s
 1399         add     w7,w7,w11
 1400         add     v12.4s,v12.4s,v13.4s
 1401         add     w8,w8,w12
 1402         add     v16.4s,v16.4s,v17.4s
 1403         eor     w17,w17,w5
 1404         add     v20.4s,v20.4s,v21.4s
 1405         eor     w19,w19,w6
 1406         eor     v3.16b,v3.16b,v0.16b
 1407         eor     w20,w20,w7
 1408         eor     v7.16b,v7.16b,v4.16b
 1409         eor     w21,w21,w8
 1410         eor     v11.16b,v11.16b,v8.16b
 1411         ror     w17,w17,#16
 1412         eor     v15.16b,v15.16b,v12.16b
 1413         ror     w19,w19,#16
 1414         eor     v19.16b,v19.16b,v16.16b
 1415         ror     w20,w20,#16
 1416         eor     v23.16b,v23.16b,v20.16b
 1417         ror     w21,w21,#16
 1418         rev32   v3.8h,v3.8h
 1419         add     w13,w13,w17
 1420         rev32   v7.8h,v7.8h
 1421         add     w14,w14,w19
 1422         rev32   v11.8h,v11.8h
 1423         add     w15,w15,w20
 1424         rev32   v15.8h,v15.8h
 1425         add     w16,w16,w21
 1426         rev32   v19.8h,v19.8h
 1427         eor     w9,w9,w13
 1428         rev32   v23.8h,v23.8h
 1429         eor     w10,w10,w14
 1430         add     v2.4s,v2.4s,v3.4s
 1431         eor     w11,w11,w15
 1432         add     v6.4s,v6.4s,v7.4s
 1433         eor     w12,w12,w16
 1434         add     v10.4s,v10.4s,v11.4s
 1435         ror     w9,w9,#20
 1436         add     v14.4s,v14.4s,v15.4s
 1437         ror     w10,w10,#20
 1438         add     v18.4s,v18.4s,v19.4s
 1439         ror     w11,w11,#20
 1440         add     v22.4s,v22.4s,v23.4s
 1441         ror     w12,w12,#20
 1442         eor     v24.16b,v1.16b,v2.16b
 1443         add     w5,w5,w9
 1444         eor     v25.16b,v5.16b,v6.16b
 1445         add     w6,w6,w10
 1446         eor     v26.16b,v9.16b,v10.16b
 1447         add     w7,w7,w11
 1448         eor     v27.16b,v13.16b,v14.16b
 1449         add     w8,w8,w12
 1450         eor     v28.16b,v17.16b,v18.16b
 1451         eor     w17,w17,w5
 1452         eor     v29.16b,v21.16b,v22.16b
 1453         eor     w19,w19,w6
 1454         ushr    v1.4s,v24.4s,#20
 1455         eor     w20,w20,w7
 1456         ushr    v5.4s,v25.4s,#20
 1457         eor     w21,w21,w8
 1458         ushr    v9.4s,v26.4s,#20
 1459         ror     w17,w17,#24
 1460         ushr    v13.4s,v27.4s,#20
 1461         ror     w19,w19,#24
 1462         ushr    v17.4s,v28.4s,#20
 1463         ror     w20,w20,#24
 1464         ushr    v21.4s,v29.4s,#20
 1465         ror     w21,w21,#24
 1466         sli     v1.4s,v24.4s,#12
 1467         add     w13,w13,w17
 1468         sli     v5.4s,v25.4s,#12
 1469         add     w14,w14,w19
 1470         sli     v9.4s,v26.4s,#12
 1471         add     w15,w15,w20
 1472         sli     v13.4s,v27.4s,#12
 1473         add     w16,w16,w21
 1474         sli     v17.4s,v28.4s,#12
 1475         eor     w9,w9,w13
 1476         sli     v21.4s,v29.4s,#12
 1477         eor     w10,w10,w14
 1478         add     v0.4s,v0.4s,v1.4s
 1479         eor     w11,w11,w15
 1480         add     v4.4s,v4.4s,v5.4s
 1481         eor     w12,w12,w16
 1482         add     v8.4s,v8.4s,v9.4s
 1483         ror     w9,w9,#25
 1484         add     v12.4s,v12.4s,v13.4s
 1485         ror     w10,w10,#25
 1486         add     v16.4s,v16.4s,v17.4s
 1487         ror     w11,w11,#25
 1488         add     v20.4s,v20.4s,v21.4s
 1489         ror     w12,w12,#25
 1490         eor     v24.16b,v3.16b,v0.16b
 1491         add     w5,w5,w10
 1492         eor     v25.16b,v7.16b,v4.16b
 1493         add     w6,w6,w11
 1494         eor     v26.16b,v11.16b,v8.16b
 1495         add     w7,w7,w12
 1496         eor     v27.16b,v15.16b,v12.16b
 1497         add     w8,w8,w9
 1498         eor     v28.16b,v19.16b,v16.16b
 1499         eor     w21,w21,w5
 1500         eor     v29.16b,v23.16b,v20.16b
 1501         eor     w17,w17,w6
 1502         ushr    v3.4s,v24.4s,#24
 1503         eor     w19,w19,w7
 1504         ushr    v7.4s,v25.4s,#24
 1505         eor     w20,w20,w8
 1506         ushr    v11.4s,v26.4s,#24
 1507         ror     w21,w21,#16
 1508         ushr    v15.4s,v27.4s,#24
 1509         ror     w17,w17,#16
 1510         ushr    v19.4s,v28.4s,#24
 1511         ror     w19,w19,#16
 1512         ushr    v23.4s,v29.4s,#24
 1513         ror     w20,w20,#16
 1514         sli     v3.4s,v24.4s,#8
 1515         add     w15,w15,w21
 1516         sli     v7.4s,v25.4s,#8
 1517         add     w16,w16,w17
 1518         sli     v11.4s,v26.4s,#8
 1519         add     w13,w13,w19
 1520         sli     v15.4s,v27.4s,#8
 1521         add     w14,w14,w20
 1522         sli     v19.4s,v28.4s,#8
 1523         eor     w10,w10,w15
 1524         sli     v23.4s,v29.4s,#8
 1525         eor     w11,w11,w16
 1526         add     v2.4s,v2.4s,v3.4s
 1527         eor     w12,w12,w13
 1528         add     v6.4s,v6.4s,v7.4s
 1529         eor     w9,w9,w14
 1530         add     v10.4s,v10.4s,v11.4s
 1531         ror     w10,w10,#20
 1532         add     v14.4s,v14.4s,v15.4s
 1533         ror     w11,w11,#20
 1534         add     v18.4s,v18.4s,v19.4s
 1535         ror     w12,w12,#20
 1536         add     v22.4s,v22.4s,v23.4s
 1537         ror     w9,w9,#20
 1538         eor     v24.16b,v1.16b,v2.16b
 1539         add     w5,w5,w10
 1540         eor     v25.16b,v5.16b,v6.16b
 1541         add     w6,w6,w11
 1542         eor     v26.16b,v9.16b,v10.16b
 1543         add     w7,w7,w12
 1544         eor     v27.16b,v13.16b,v14.16b
 1545         add     w8,w8,w9
 1546         eor     v28.16b,v17.16b,v18.16b
 1547         eor     w21,w21,w5
 1548         eor     v29.16b,v21.16b,v22.16b
 1549         eor     w17,w17,w6
 1550         ushr    v1.4s,v24.4s,#25
 1551         eor     w19,w19,w7
 1552         ushr    v5.4s,v25.4s,#25
 1553         eor     w20,w20,w8
 1554         ushr    v9.4s,v26.4s,#25
 1555         ror     w21,w21,#24
 1556         ushr    v13.4s,v27.4s,#25
 1557         ror     w17,w17,#24
 1558         ushr    v17.4s,v28.4s,#25
 1559         ror     w19,w19,#24
 1560         ushr    v21.4s,v29.4s,#25
 1561         ror     w20,w20,#24
 1562         sli     v1.4s,v24.4s,#7
 1563         add     w15,w15,w21
 1564         sli     v5.4s,v25.4s,#7
 1565         add     w16,w16,w17
 1566         sli     v9.4s,v26.4s,#7
 1567         add     w13,w13,w19
 1568         sli     v13.4s,v27.4s,#7
 1569         add     w14,w14,w20
 1570         sli     v17.4s,v28.4s,#7
 1571         eor     w10,w10,w15
 1572         sli     v21.4s,v29.4s,#7
 1573         eor     w11,w11,w16
 1574         ext     v2.16b,v2.16b,v2.16b,#8
 1575         eor     w12,w12,w13
 1576         ext     v6.16b,v6.16b,v6.16b,#8
 1577         eor     w9,w9,w14
 1578         ext     v10.16b,v10.16b,v10.16b,#8
 1579         ror     w10,w10,#25
 1580         ext     v14.16b,v14.16b,v14.16b,#8
 1581         ror     w11,w11,#25
 1582         ext     v18.16b,v18.16b,v18.16b,#8
 1583         ror     w12,w12,#25
 1584         ext     v22.16b,v22.16b,v22.16b,#8
 1585         ror     w9,w9,#25
 1586         ext     v3.16b,v3.16b,v3.16b,#12
 1587         ext     v7.16b,v7.16b,v7.16b,#12
 1588         ext     v11.16b,v11.16b,v11.16b,#12
 1589         ext     v15.16b,v15.16b,v15.16b,#12
 1590         ext     v19.16b,v19.16b,v19.16b,#12
 1591         ext     v23.16b,v23.16b,v23.16b,#12
 1592         ext     v1.16b,v1.16b,v1.16b,#4
 1593         ext     v5.16b,v5.16b,v5.16b,#4
 1594         ext     v9.16b,v9.16b,v9.16b,#4
 1595         ext     v13.16b,v13.16b,v13.16b,#4
 1596         ext     v17.16b,v17.16b,v17.16b,#4
 1597         ext     v21.16b,v21.16b,v21.16b,#4
 1598         add     v0.4s,v0.4s,v1.4s
 1599         add     w5,w5,w9
 1600         add     v4.4s,v4.4s,v5.4s
 1601         add     w6,w6,w10
 1602         add     v8.4s,v8.4s,v9.4s
 1603         add     w7,w7,w11
 1604         add     v12.4s,v12.4s,v13.4s
 1605         add     w8,w8,w12
 1606         add     v16.4s,v16.4s,v17.4s
 1607         eor     w17,w17,w5
 1608         add     v20.4s,v20.4s,v21.4s
 1609         eor     w19,w19,w6
 1610         eor     v3.16b,v3.16b,v0.16b
 1611         eor     w20,w20,w7
 1612         eor     v7.16b,v7.16b,v4.16b
 1613         eor     w21,w21,w8
 1614         eor     v11.16b,v11.16b,v8.16b
 1615         ror     w17,w17,#16
 1616         eor     v15.16b,v15.16b,v12.16b
 1617         ror     w19,w19,#16
 1618         eor     v19.16b,v19.16b,v16.16b
 1619         ror     w20,w20,#16
 1620         eor     v23.16b,v23.16b,v20.16b
 1621         ror     w21,w21,#16
 1622         rev32   v3.8h,v3.8h
 1623         add     w13,w13,w17
 1624         rev32   v7.8h,v7.8h
 1625         add     w14,w14,w19
 1626         rev32   v11.8h,v11.8h
 1627         add     w15,w15,w20
 1628         rev32   v15.8h,v15.8h
 1629         add     w16,w16,w21
 1630         rev32   v19.8h,v19.8h
 1631         eor     w9,w9,w13
 1632         rev32   v23.8h,v23.8h
 1633         eor     w10,w10,w14
 1634         add     v2.4s,v2.4s,v3.4s
 1635         eor     w11,w11,w15
 1636         add     v6.4s,v6.4s,v7.4s
 1637         eor     w12,w12,w16
 1638         add     v10.4s,v10.4s,v11.4s
 1639         ror     w9,w9,#20
 1640         add     v14.4s,v14.4s,v15.4s
 1641         ror     w10,w10,#20
 1642         add     v18.4s,v18.4s,v19.4s
 1643         ror     w11,w11,#20
 1644         add     v22.4s,v22.4s,v23.4s
 1645         ror     w12,w12,#20
 1646         eor     v24.16b,v1.16b,v2.16b
 1647         add     w5,w5,w9
 1648         eor     v25.16b,v5.16b,v6.16b
 1649         add     w6,w6,w10
 1650         eor     v26.16b,v9.16b,v10.16b
 1651         add     w7,w7,w11
 1652         eor     v27.16b,v13.16b,v14.16b
 1653         add     w8,w8,w12
 1654         eor     v28.16b,v17.16b,v18.16b
 1655         eor     w17,w17,w5
 1656         eor     v29.16b,v21.16b,v22.16b
 1657         eor     w19,w19,w6
 1658         ushr    v1.4s,v24.4s,#20
 1659         eor     w20,w20,w7
 1660         ushr    v5.4s,v25.4s,#20
 1661         eor     w21,w21,w8
 1662         ushr    v9.4s,v26.4s,#20
 1663         ror     w17,w17,#24
 1664         ushr    v13.4s,v27.4s,#20
 1665         ror     w19,w19,#24
 1666         ushr    v17.4s,v28.4s,#20
 1667         ror     w20,w20,#24
 1668         ushr    v21.4s,v29.4s,#20
 1669         ror     w21,w21,#24
 1670         sli     v1.4s,v24.4s,#12
 1671         add     w13,w13,w17
 1672         sli     v5.4s,v25.4s,#12
 1673         add     w14,w14,w19
 1674         sli     v9.4s,v26.4s,#12
 1675         add     w15,w15,w20
 1676         sli     v13.4s,v27.4s,#12
 1677         add     w16,w16,w21
 1678         sli     v17.4s,v28.4s,#12
 1679         eor     w9,w9,w13
 1680         sli     v21.4s,v29.4s,#12
 1681         eor     w10,w10,w14
 1682         add     v0.4s,v0.4s,v1.4s
 1683         eor     w11,w11,w15
 1684         add     v4.4s,v4.4s,v5.4s
 1685         eor     w12,w12,w16
 1686         add     v8.4s,v8.4s,v9.4s
 1687         ror     w9,w9,#25
 1688         add     v12.4s,v12.4s,v13.4s
 1689         ror     w10,w10,#25
 1690         add     v16.4s,v16.4s,v17.4s
 1691         ror     w11,w11,#25
 1692         add     v20.4s,v20.4s,v21.4s
 1693         ror     w12,w12,#25
 1694         eor     v24.16b,v3.16b,v0.16b
 1695         add     w5,w5,w10
 1696         eor     v25.16b,v7.16b,v4.16b
 1697         add     w6,w6,w11
 1698         eor     v26.16b,v11.16b,v8.16b
 1699         add     w7,w7,w12
 1700         eor     v27.16b,v15.16b,v12.16b
 1701         add     w8,w8,w9
 1702         eor     v28.16b,v19.16b,v16.16b
 1703         eor     w21,w21,w5
 1704         eor     v29.16b,v23.16b,v20.16b
 1705         eor     w17,w17,w6
 1706         ushr    v3.4s,v24.4s,#24
 1707         eor     w19,w19,w7
 1708         ushr    v7.4s,v25.4s,#24
 1709         eor     w20,w20,w8
 1710         ushr    v11.4s,v26.4s,#24
 1711         ror     w21,w21,#16
 1712         ushr    v15.4s,v27.4s,#24
 1713         ror     w17,w17,#16
 1714         ushr    v19.4s,v28.4s,#24
 1715         ror     w19,w19,#16
 1716         ushr    v23.4s,v29.4s,#24
 1717         ror     w20,w20,#16
 1718         sli     v3.4s,v24.4s,#8
 1719         add     w15,w15,w21
 1720         sli     v7.4s,v25.4s,#8
 1721         add     w16,w16,w17
 1722         sli     v11.4s,v26.4s,#8
 1723         add     w13,w13,w19
 1724         sli     v15.4s,v27.4s,#8
 1725         add     w14,w14,w20
 1726         sli     v19.4s,v28.4s,#8
 1727         eor     w10,w10,w15
 1728         sli     v23.4s,v29.4s,#8
 1729         eor     w11,w11,w16
 1730         add     v2.4s,v2.4s,v3.4s
 1731         eor     w12,w12,w13
 1732         add     v6.4s,v6.4s,v7.4s
 1733         eor     w9,w9,w14
 1734         add     v10.4s,v10.4s,v11.4s
 1735         ror     w10,w10,#20
 1736         add     v14.4s,v14.4s,v15.4s
 1737         ror     w11,w11,#20
 1738         add     v18.4s,v18.4s,v19.4s
 1739         ror     w12,w12,#20
 1740         add     v22.4s,v22.4s,v23.4s
 1741         ror     w9,w9,#20
 1742         eor     v24.16b,v1.16b,v2.16b
 1743         add     w5,w5,w10
 1744         eor     v25.16b,v5.16b,v6.16b
 1745         add     w6,w6,w11
 1746         eor     v26.16b,v9.16b,v10.16b
 1747         add     w7,w7,w12
 1748         eor     v27.16b,v13.16b,v14.16b
 1749         add     w8,w8,w9
 1750         eor     v28.16b,v17.16b,v18.16b
 1751         eor     w21,w21,w5
 1752         eor     v29.16b,v21.16b,v22.16b
 1753         eor     w17,w17,w6
 1754         ushr    v1.4s,v24.4s,#25
 1755         eor     w19,w19,w7
 1756         ushr    v5.4s,v25.4s,#25
 1757         eor     w20,w20,w8
 1758         ushr    v9.4s,v26.4s,#25
 1759         ror     w21,w21,#24
 1760         ushr    v13.4s,v27.4s,#25
 1761         ror     w17,w17,#24
 1762         ushr    v17.4s,v28.4s,#25
 1763         ror     w19,w19,#24
 1764         ushr    v21.4s,v29.4s,#25
 1765         ror     w20,w20,#24
 1766         sli     v1.4s,v24.4s,#7
 1767         add     w15,w15,w21
 1768         sli     v5.4s,v25.4s,#7
 1769         add     w16,w16,w17
 1770         sli     v9.4s,v26.4s,#7
 1771         add     w13,w13,w19
 1772         sli     v13.4s,v27.4s,#7
 1773         add     w14,w14,w20
 1774         sli     v17.4s,v28.4s,#7
 1775         eor     w10,w10,w15
 1776         sli     v21.4s,v29.4s,#7
 1777         eor     w11,w11,w16
 1778         ext     v2.16b,v2.16b,v2.16b,#8
 1779         eor     w12,w12,w13
 1780         ext     v6.16b,v6.16b,v6.16b,#8
 1781         eor     w9,w9,w14
 1782         ext     v10.16b,v10.16b,v10.16b,#8
 1783         ror     w10,w10,#25
 1784         ext     v14.16b,v14.16b,v14.16b,#8
 1785         ror     w11,w11,#25
 1786         ext     v18.16b,v18.16b,v18.16b,#8
 1787         ror     w12,w12,#25
 1788         ext     v22.16b,v22.16b,v22.16b,#8
 1789         ror     w9,w9,#25
 1790         ext     v3.16b,v3.16b,v3.16b,#4
 1791         ext     v7.16b,v7.16b,v7.16b,#4
 1792         ext     v11.16b,v11.16b,v11.16b,#4
 1793         ext     v15.16b,v15.16b,v15.16b,#4
 1794         ext     v19.16b,v19.16b,v19.16b,#4
 1795         ext     v23.16b,v23.16b,v23.16b,#4
 1796         ext     v1.16b,v1.16b,v1.16b,#12
 1797         ext     v5.16b,v5.16b,v5.16b,#12
 1798         ext     v9.16b,v9.16b,v9.16b,#12
 1799         ext     v13.16b,v13.16b,v13.16b,#12
 1800         ext     v17.16b,v17.16b,v17.16b,#12
 1801         ext     v21.16b,v21.16b,v21.16b,#12
 1802         cbnz    x4,.Loop_lower_neon
 1803 
 1804         add     w5,w5,w22               // accumulate key block
 1805         ldp     q24,q25,[sp,#0]
 1806         add     x6,x6,x22,lsr#32
 1807         ldp     q26,q27,[sp,#32]
 1808         add     w7,w7,w23
 1809         ldp     q28,q29,[sp,#64]
 1810         add     x8,x8,x23,lsr#32
 1811         add     v0.4s,v0.4s,v24.4s
 1812         add     w9,w9,w24
 1813         add     v4.4s,v4.4s,v24.4s
 1814         add     x10,x10,x24,lsr#32
 1815         add     v8.4s,v8.4s,v24.4s
 1816         add     w11,w11,w25
 1817         add     v12.4s,v12.4s,v24.4s
 1818         add     x12,x12,x25,lsr#32
 1819         add     v16.4s,v16.4s,v24.4s
 1820         add     w13,w13,w26
 1821         add     v20.4s,v20.4s,v24.4s
 1822         add     x14,x14,x26,lsr#32
 1823         add     v2.4s,v2.4s,v26.4s
 1824         add     w15,w15,w27
 1825         add     v6.4s,v6.4s,v26.4s
 1826         add     x16,x16,x27,lsr#32
 1827         add     v10.4s,v10.4s,v26.4s
 1828         add     w17,w17,w28
 1829         add     v14.4s,v14.4s,v26.4s
 1830         add     x19,x19,x28,lsr#32
 1831         add     v18.4s,v18.4s,v26.4s
 1832         add     w20,w20,w30
 1833         add     v22.4s,v22.4s,v26.4s
 1834         add     x21,x21,x30,lsr#32
 1835         add     v19.4s,v19.4s,v31.4s                    // +4
 1836         add     x5,x5,x6,lsl#32 // pack
 1837         add     v23.4s,v23.4s,v31.4s                    // +4
 1838         add     x7,x7,x8,lsl#32
 1839         add     v3.4s,v3.4s,v27.4s
 1840         ldp     x6,x8,[x1,#0]           // load input
 1841         add     v7.4s,v7.4s,v28.4s
 1842         add     x9,x9,x10,lsl#32
 1843         add     v11.4s,v11.4s,v29.4s
 1844         add     x11,x11,x12,lsl#32
 1845         add     v15.4s,v15.4s,v30.4s
 1846         ldp     x10,x12,[x1,#16]
 1847         add     v19.4s,v19.4s,v27.4s
 1848         add     x13,x13,x14,lsl#32
 1849         add     v23.4s,v23.4s,v28.4s
 1850         add     x15,x15,x16,lsl#32
 1851         add     v1.4s,v1.4s,v25.4s
 1852         ldp     x14,x16,[x1,#32]
 1853         add     v5.4s,v5.4s,v25.4s
 1854         add     x17,x17,x19,lsl#32
 1855         add     v9.4s,v9.4s,v25.4s
 1856         add     x20,x20,x21,lsl#32
 1857         add     v13.4s,v13.4s,v25.4s
 1858         ldp     x19,x21,[x1,#48]
 1859         add     v17.4s,v17.4s,v25.4s
 1860         add     x1,x1,#64
 1861         add     v21.4s,v21.4s,v25.4s
 1862 
 1863 #ifdef  __ARMEB__
 1864         rev     x5,x5
 1865         rev     x7,x7
 1866         rev     x9,x9
 1867         rev     x11,x11
 1868         rev     x13,x13
 1869         rev     x15,x15
 1870         rev     x17,x17
 1871         rev     x20,x20
 1872 #endif
 1873         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
 1874         eor     x5,x5,x6
 1875         eor     x7,x7,x8
 1876         eor     x9,x9,x10
 1877         eor     x11,x11,x12
 1878         eor     x13,x13,x14
 1879         eor     v0.16b,v0.16b,v24.16b
 1880         eor     x15,x15,x16
 1881         eor     v1.16b,v1.16b,v25.16b
 1882         eor     x17,x17,x19
 1883         eor     v2.16b,v2.16b,v26.16b
 1884         eor     x20,x20,x21
 1885         eor     v3.16b,v3.16b,v27.16b
 1886         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
 1887 
 1888         stp     x5,x7,[x0,#0]           // store output
 1889         add     x28,x28,#7                      // increment counter
 1890         stp     x9,x11,[x0,#16]
 1891         stp     x13,x15,[x0,#32]
 1892         stp     x17,x20,[x0,#48]
 1893         add     x0,x0,#64
 1894         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
 1895 
 1896         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
 1897         eor     v4.16b,v4.16b,v24.16b
 1898         eor     v5.16b,v5.16b,v25.16b
 1899         eor     v6.16b,v6.16b,v26.16b
 1900         eor     v7.16b,v7.16b,v27.16b
 1901         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
 1902 
 1903         ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
 1904         eor     v8.16b,v8.16b,v0.16b
 1905         ldp     q24,q25,[sp,#0]
 1906         eor     v9.16b,v9.16b,v1.16b
 1907         ldp     q26,q27,[sp,#32]
 1908         eor     v10.16b,v10.16b,v2.16b
 1909         eor     v11.16b,v11.16b,v3.16b
 1910         st1     {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
 1911 
 1912         ld1     {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
 1913         eor     v12.16b,v12.16b,v4.16b
 1914         eor     v13.16b,v13.16b,v5.16b
 1915         eor     v14.16b,v14.16b,v6.16b
 1916         eor     v15.16b,v15.16b,v7.16b
 1917         st1     {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
 1918 
 1919         ld1     {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
 1920         eor     v16.16b,v16.16b,v8.16b
 1921         eor     v17.16b,v17.16b,v9.16b
 1922         eor     v18.16b,v18.16b,v10.16b
 1923         eor     v19.16b,v19.16b,v11.16b
 1924         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
 1925 
 1926         shl     v0.4s,v31.4s,#1                 // 4 -> 8
 1927         eor     v20.16b,v20.16b,v12.16b
 1928         eor     v21.16b,v21.16b,v13.16b
 1929         eor     v22.16b,v22.16b,v14.16b
 1930         eor     v23.16b,v23.16b,v15.16b
 1931         st1     {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
 1932 
 1933         add     v27.4s,v27.4s,v0.4s                     // += 8
 1934         add     v28.4s,v28.4s,v0.4s
 1935         add     v29.4s,v29.4s,v0.4s
 1936         add     v30.4s,v30.4s,v0.4s
 1937 
 1938         b.hs    .Loop_outer_512_neon
 1939 
 1940         adds    x2,x2,#512
 1941         ushr    v0.4s,v31.4s,#2                 // 4 -> 1
 1942 
 1943         ldp     d8,d9,[sp,#128+0]               // meet ABI requirements
 1944         ldp     d10,d11,[sp,#128+16]
 1945         ldp     d12,d13,[sp,#128+32]
 1946         ldp     d14,d15,[sp,#128+48]
 1947 
 1948         stp     q24,q31,[sp,#0]         // wipe off-load area
 1949         stp     q24,q31,[sp,#32]
 1950         stp     q24,q31,[sp,#64]
 1951 
 1952         b.eq    .Ldone_512_neon
 1953 
 1954         cmp     x2,#192
 1955         sub     v27.4s,v27.4s,v0.4s                     // -= 1
 1956         sub     v28.4s,v28.4s,v0.4s
 1957         sub     v29.4s,v29.4s,v0.4s
 1958         add     sp,sp,#128
 1959         b.hs    .Loop_outer_neon
 1960 
 1961         eor     v25.16b,v25.16b,v25.16b
 1962         eor     v26.16b,v26.16b,v26.16b
 1963         eor     v27.16b,v27.16b,v27.16b
 1964         eor     v28.16b,v28.16b,v28.16b
 1965         eor     v29.16b,v29.16b,v29.16b
 1966         eor     v30.16b,v30.16b,v30.16b
 1967         b       .Loop_outer
 1968 
 1969 .Ldone_512_neon:
 1970         ldp     x19,x20,[x29,#16]
 1971         add     sp,sp,#128+64
 1972         ldp     x21,x22,[x29,#32]
 1973         ldp     x23,x24,[x29,#48]
 1974         ldp     x25,x26,[x29,#64]
 1975         ldp     x27,x28,[x29,#80]
 1976         ldp     x29,x30,[sp],#96
 1977 .inst   0xd50323bf                      // autiasp
 1978         ret
 1979 .size   ChaCha20_512_neon,.-ChaCha20_512_neon

Cache object: 146e534c15ca818fdc691afc63d7853a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.