The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/crypto/skein/amd64/skein_block_asm.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 #
    2 #----------------------------------------------------------------
    3 # 64-bit x86 assembler code (gnu as) for Skein block functions
    4 #
    5 # Author: Doug Whiting, Hifn/Exar
    6 #
    7 # This code is released to the public domain.
    8 #----------------------------------------------------------------
    9 # $FreeBSD$
   10 #
   11     .text                                   #everything below is assembled into the code section
   12     .altmacro                               #alternate macro syntax: the %expr and <...> macro arguments used below need it
   13 #ifndef __clang__
   14     .psize 0,128                            #list file has no page boundaries
   15 #endif
   16 #
   17 _MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
   18 _MAX_FRAME_ =  240                          #(not referenced in the visible part of this file)
   19 #
   20 #################
      #_USE_ASM_ is a bit mask built from the block sizes 256, 512 and 1024.
      #Each per-size section below is wrapped in ".if _USE_ASM_ & <size>".
      #All three sizes are built unless the preprocessor symbol SKEIN_USE_ASM
      #supplies a different mask.
   21 #ifndef SKEIN_USE_ASM
   22 _USE_ASM_         = _MASK_ALL_
   23 #else
   24 _USE_ASM_         = SKEIN_USE_ASM
   25 #endif
   26 #################
   27 #configure loop unrolling
      #_SKEIN_LOOP is decoded as three decimal digits, one per block size:
      #hundreds digit = Skein-256, tens digit = Skein-512, ones digit = Skein-1024.
      #A digit of 0 asks for a fully unrolled round sequence for that size.
   28 #ifndef SKEIN_LOOP
   29 _SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
   30 #else
   31 _SKEIN_LOOP       = SKEIN_LOOP
   32   .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
   33 #.print  "+++ SKEIN_LOOP = \_NN_"
   34   .endr
   35 #endif
   36 # the unroll counts (0 --> fully unrolled)
   37 SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
   38 SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
   39 SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
   40 #
      #SKEIN_ASM_UNROLL accumulates, as a bit mask, every block size whose
      #unroll count is 0; later code tests it with "(SKEIN_ASM_UNROLL) & <size>"
      #to choose between the fully unrolled and the looping code paths.
   41 SKEIN_ASM_UNROLL  = 0
   42   .irp _NN_,256,512,1024
   43     .if (SKEIN_UNROLL_\_NN_) == 0
   44 SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
   45     .endif
   46   .endr
   47 #################
   48 #
   49 .ifndef SKEIN_ROUNDS                        #no override symbol: use the default round counts
   50 ROUNDS_256  =   72
   51 ROUNDS_512  =   72
   52 ROUNDS_1024 =   80
   53 .else
      #SKEIN_ROUNDS is read as three decimal digits (hundreds = 256, tens = 512,
      #ones = 1024).  A digit d selects 8*(((d+5) % 10)+5) rounds, which gives
      #40..112 in steps of 8 (d=5 gives 40, d=9 gives 72, d=0 gives 80, d=4 gives 112).
      #Unlike SKEIN_LOOP above, this symbol is tested with the assembler
      #directive .ifndef rather than with the C preprocessor.
   54 ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
   55 ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
   56 ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
   57 # only display rounds if default size is changed on command line
      #NOTE(review): the "_NN_ < 1024" test below names the .irp parameter
      #without a backslash; presumably this relies on .altmacro substituting
      #bare parameter names -- confirm.  It only affects column alignment of
      #the printed message.
   58 .irp _NN_,256,512,1024
   59   .if _USE_ASM_ & \_NN_
   60     .irp _RR_,%(ROUNDS_\_NN_)
   61       .if _NN_ < 1024
   62 .print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
   63       .else
   64 .print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
   65       .endif
   66     .endr
   67   .endif
   68 .endr
   69 .endif
   70 #################
   71 #
   72 .ifdef SKEIN_CODE_SIZE                       #_SKEIN_CODE_SIZE = 1 emits the *_CodeSize and *_Unroll_Cnt helper entry points
   73 _SKEIN_CODE_SIZE = (1)
   74 .else
   75 .ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
   76 _SKEIN_CODE_SIZE = (1)
   77 .else
   78 _SKEIN_CODE_SIZE = (0)
   79 .endif
   80 .endif
   81 #
   82 #################
   83 #
      #_SKEIN_DEBUG = 1 (when the symbol SKEIN_DEBUG is defined) turns the
      #Skein_Debug_Block / Skein_Debug_Round macros into real calls; otherwise
      #those macros are defined empty further down.
   84 .ifndef SKEIN_DEBUG
   85 _SKEIN_DEBUG      = 0
   86 .else
   87 _SKEIN_DEBUG      = 1
   88 .endif
   89 #################
   90 #
   91 #byte offsets of the fields in the hash context structure (each offset builds on the previous one)
   92 #
   93 HASH_BITS   =   0                   #bits of hash output
   94 BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
   95 TWEAK       =   8 + BCNT            #tweak values[0..1]   (evaluates to 16)
   96 X_VARS      =  16 + TWEAK           #chaining vars        (evaluates to 32)
   97 #
   98 #(Note: buffer[] in context structure is NOT needed here :-)
   99 #
  100 KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
  101 FIRST_MASK  =   ~ (1 <<  6)         #(not referenced in the visible part of this file)
  102 FIRST_MASK64=   ~ (1 << 62)         #AND-ed with T[1] in the feedforward step to clear bit 62
  103 #
  104 # rotation constants for Skein
      # Naming: RC_<block bits>_<row>_<column>.  The RotL64 macro below pastes
      # the three fields together; callers pass (round number mod 8) as the row
      # and the index of the rotate within that round as the column.
      # Skein-256 has two rotates per round, hence two columns per row.
  105 #
  106 RC_256_0_0  = 14
  107 RC_256_0_1  = 16
  108 
  109 RC_256_1_0  = 52
  110 RC_256_1_1  = 57
  111 
  112 RC_256_2_0  = 23
  113 RC_256_2_1  = 40
  114 
  115 RC_256_3_0  =  5
  116 RC_256_3_1  = 37
  117 
  118 RC_256_4_0  = 25
  119 RC_256_4_1  = 33
  120 
  121 RC_256_5_0  = 46
  122 RC_256_5_1  = 12
  123 
  124 RC_256_6_0  = 58
  125 RC_256_6_1  = 22
  126 
  127 RC_256_7_0  = 32
  128 RC_256_7_1  = 32
  129 
      # Skein-512 rotation table: 8 rows (round number mod 8) of 4 columns,
      # one column for each of the four rotates that R_512_OneRound emits.
  130 RC_512_0_0  = 46
  131 RC_512_0_1  = 36
  132 RC_512_0_2  = 19
  133 RC_512_0_3  = 37
  134 
  135 RC_512_1_0  = 33
  136 RC_512_1_1  = 27
  137 RC_512_1_2  = 14
  138 RC_512_1_3  = 42
  139 
  140 RC_512_2_0  = 17
  141 RC_512_2_1  = 49
  142 RC_512_2_2  = 36
  143 RC_512_2_3  = 39
  144 
  145 RC_512_3_0  = 44
  146 RC_512_3_1  =  9
  147 RC_512_3_2  = 54
  148 RC_512_3_3  = 56
  149 
  150 RC_512_4_0  = 39
  151 RC_512_4_1  = 30
  152 RC_512_4_2  = 34
  153 RC_512_4_3  = 24
  154 
  155 RC_512_5_0  = 13
  156 RC_512_5_1  = 50
  157 RC_512_5_2  = 10
  158 RC_512_5_3  = 17
  159 
  160 RC_512_6_0  = 25
  161 RC_512_6_1  = 29
  162 RC_512_6_2  = 39
  163 RC_512_6_3  = 43
  164 
  165 RC_512_7_0  =  8
  166 RC_512_7_1  = 35
  167 RC_512_7_2  = 56
  168 RC_512_7_3  = 22
  169 
      # Skein-1024 rotation table: 8 rows of 8 columns, same naming scheme as
      # the 256 and 512 tables above (the code that uses it is outside the
      # visible part of this file).
  170 RC_1024_0_0 = 24
  171 RC_1024_0_1 = 13
  172 RC_1024_0_2 =  8
  173 RC_1024_0_3 = 47
  174 RC_1024_0_4 =  8
  175 RC_1024_0_5 = 17
  176 RC_1024_0_6 = 22
  177 RC_1024_0_7 = 37
  178 
  179 RC_1024_1_0 = 38
  180 RC_1024_1_1 = 19
  181 RC_1024_1_2 = 10
  182 RC_1024_1_3 = 55
  183 RC_1024_1_4 = 49
  184 RC_1024_1_5 = 18
  185 RC_1024_1_6 = 23
  186 RC_1024_1_7 = 52
  187 
  188 RC_1024_2_0 = 33
  189 RC_1024_2_1 =  4
  190 RC_1024_2_2 = 51
  191 RC_1024_2_3 = 13
  192 RC_1024_2_4 = 34
  193 RC_1024_2_5 = 41
  194 RC_1024_2_6 = 59
  195 RC_1024_2_7 = 17
  196 
  197 RC_1024_3_0 =  5
  198 RC_1024_3_1 = 20
  199 RC_1024_3_2 = 48
  200 RC_1024_3_3 = 41
  201 RC_1024_3_4 = 47
  202 RC_1024_3_5 = 28
  203 RC_1024_3_6 = 16
  204 RC_1024_3_7 = 25
  205 
  206 RC_1024_4_0 = 41
  207 RC_1024_4_1 =  9
  208 RC_1024_4_2 = 37
  209 RC_1024_4_3 = 31
  210 RC_1024_4_4 = 12
  211 RC_1024_4_5 = 47
  212 RC_1024_4_6 = 44
  213 RC_1024_4_7 = 30
  214 
  215 RC_1024_5_0 = 16
  216 RC_1024_5_1 = 34
  217 RC_1024_5_2 = 56
  218 RC_1024_5_3 = 51
  219 RC_1024_5_4 =  4
  220 RC_1024_5_5 = 53
  221 RC_1024_5_6 = 42
  222 RC_1024_5_7 = 41
  223 
  224 RC_1024_6_0 = 31
  225 RC_1024_6_1 = 44
  226 RC_1024_6_2 = 47
  227 RC_1024_6_3 = 46
  228 RC_1024_6_4 = 19
  229 RC_1024_6_5 = 42
  230 RC_1024_6_6 = 44
  231 RC_1024_6_7 = 25
  232 
  233 RC_1024_7_0 =  9
  234 RC_1024_7_1 = 48
  235 RC_1024_7_2 = 35
  236 RC_1024_7_3 = 52
  237 RC_1024_7_4 = 23
  238 RC_1024_7_5 = 31
  239 RC_1024_7_6 = 37
  240 RC_1024_7_7 = 20
  241 #
  242 #  Input:  reg
  243 # Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
  244 #
      # MACRO RotL64: rotate a 64-bit register left by a table-driven count.
      #   reg        register name without the percent sign (for example rbx)
      #   BLK_SIZE   256, 512 or 1024; selects the RC_<BLK_SIZE>_* table
      #   ROUND_NUM  table row (callers in this file pass round number mod 8)
      #   MIX_NUM    table column
      # The constant name is pasted together textually, so ROUND_NUM and
      # MIX_NUM must arrive as plain numbers; callers use the altmacro
      # percent-expression form to get that.
      # No instruction is emitted when the selected constant is zero.
  245 .macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  246   .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM  #is there anything to do?
  247     rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  248   .endif
  249 .endm
  250 #
  251 #----------------------------------------------------------------
  252 #
  253 # MACROS: define local vars and configure stack
  254 #
  255 #----------------------------------------------------------------
  256 # declare allocated space on the stack
      # MACRO StackVar: bump allocator for stack-frame offsets.
      #   localName  symbol that receives the current value of _STK_OFFS_
      #   localSize  number of bytes to reserve
      # _STK_OFFS_ is then advanced by localSize.  No instructions are emitted;
      # _STK_OFFS_ must have been initialised by the caller (Setup_Stack does so).
  257 .macro StackVar localName,localSize
  258 \localName  =   _STK_OFFS_
  259 _STK_OFFS_  =   _STK_OFFS_+(\localSize)
  260 .endm #StackVar
  261 #
  262 #----------------------------------------------------------------
  263 #
  264 # MACRO: Configure stack frame, allocate local vars
      #
      # Setup_Stack BLK_BITS,KS_CNT,debugCnt
      #   BLK_BITS  block size in bits (256/512/1024); WCNT = BLK_BITS/64 words
      #   KS_CNT    sizes the extra ksRot area (16*KS_CNT bytes); only reserved
      #             when BLK_BITS is not in SKEIN_ASM_UNROLL (looping code)
      #   debugCnt  optional; when _SKEIN_DEBUG is set and this is non-zero,
      #             reserves 8*debugCnt bytes named xDebug_<BLK_BITS>
      # Emitted code: pushes rbp,rbx,r12,r13,r14,r15; subtracts LOCAL_SIZE from
      # rsp; points rbp at rsp+FRAME_OFFS; stores rdi,rsi,rdx,rcx into
      # ctxPtr,blkPtr,blkCnt,bitAdd.
      # Symbols (re)defined on every use, all plain assembler assignments and
      # therefore shared by the whole file: WCNT, _PushCnt_, _STK_OFFS_, the
      # frame offsets X_stk/ksTwk/ksKey/ksRot/Wcopy/align16/ctxPtr/blkPtr/
      # blkCnt/bitAdd/savRegs/retAddr, LOCAL_SIZE, FRAME_OFFS and F_O.
      # Frame offsets are relative to rsp after the subq; add F_O to use them
      # relative to rbp.
      # NOTE(review): every term of the alignment test is a multiple of 8 (as
      # long as the macro arguments are integers), so that test looks always
      # true and the 8-byte align16 slot is always reserved -- confirm whether
      # a modulo-16 test was intended.
  265 #
  266 .macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
  267     WCNT    =    (\BLK_BITS)/64
  268 #
  269 _PushCnt_   =   0                   #save nonvolatile regs on stack
  270   .irp _reg_,rbp,rbx,r12,r13,r14,r15
  271        pushq    %\_reg_
  272 _PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
  273   .endr
  274 #
  275 _STK_OFFS_  =   0                   #starting offset from rsp
  276     #---- local  variables         #<-- rsp
  277     StackVar    X_stk  ,8*(WCNT)    #local context vars
  278     StackVar    ksTwk  ,8*3         #key schedule: tweak words
  279     StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
  280   .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
  281     StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
  282   .endif
  283     StackVar    Wcopy  ,8*(WCNT)    #copy of input block
  284   .if _SKEIN_DEBUG
  285   .if \debugCnt + 0                 #temp location for debug X[] info
  286     StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
  287   .endif
  288   .endif
  289   .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
  290     StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
  291 tmpStk_\BLK_BITS = align16          #use this
  292   .endif
  293     #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
  294     StackVar    ctxPtr ,8           #context ptr
  295     StackVar    blkPtr ,8           #pointer to block data
  296     StackVar    blkCnt ,8           #number of full blocks to process
  297     StackVar    bitAdd ,8           #bit count to add to tweak
  298 LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
  299     #---- 
  300     StackVar    savRegs,8*_PushCnt_ #saved registers
  301     StackVar    retAddr,8           #return address
  302     #---- caller's stack frame (aligned mod 16)
  303 #
  304 # set up the stack frame pointer (rbp)
  305 #
  306 FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
  307   .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
  308 FRAME_OFFS  =      _STK_OFFS_
  309   .endif
  310 F_O         =   -FRAME_OFFS
  311 #
  312   #put some useful defines in the .lst file (for grep)
  313 __STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
  314 __STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
  315 __STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
  316 #
  317 # Notes on stack frame setup:
  318 #   * the most frequently used variable is X_stk[], based at [rsp+0]
  319 #   * the next most used is the key schedule arrays, ksKey and ksTwk
  320 #       so rbp is "centered" there, allowing short offsets to the key 
  321 #       schedule even in 1024-bit Skein case
  322 #   * the Wcopy variables are infrequently accessed, but they have long 
  323 #       offsets from both rsp and rbp only in the 1024-bit case.
  324 #   * all other local vars and calling parameters can be accessed 
  325 #       with short offsets, except in the 1024-bit case
  326 #
  327     subq    $LOCAL_SIZE,%rsp        #make room for the locals
  328     leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
  329     movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
  330     movq    %rsi, blkPtr+F_O(%rbp)
  331     movq    %rdx, blkCnt+F_O(%rbp)
  332     movq    %rcx, bitAdd+F_O(%rbp)
  333 #
  334 .endm #Setup_Stack
  335 #
  336 #----------------------------------------------------------------
  337 # MACRO Reset_Stack: undo Setup_Stack just before returning.
      # Adds LOCAL_SIZE back to rsp and pops the six saved registers in the
      # reverse of the order in which Setup_Stack pushed them.  _PushCnt_ is
      # counted back down and the assembly is aborted if it does not reach zero.
      # Uses the LOCAL_SIZE and _PushCnt_ values left by the most recent
      # Setup_Stack expansion.  The locals are not cleared.
  338 .macro Reset_Stack
  339     addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
  340   .irp _reg_,r15,r14,r13,r12,rbx,rbp
  341     popq    %\_reg_                 #restore caller's regs
  342 _PushCnt_ = _PushCnt_ - 1
  343   .endr
  344   .if _PushCnt_
  345     .error  "Mismatched push/pops?"
  346   .endif
  347 .endm # Reset_Stack
  348 #
  349 #----------------------------------------------------------------
  350 # macros to help debug internals
  351 #
  352 .if _SKEIN_DEBUG
  353     .extern  Skein_Show_Block     #calls to C routines
  354     .extern  Skein_Show_Round
  355 #
      # Pseudo round numbers passed to the debug hook.  Skein_Debug_Round
      # treats any round value >= SKEIN_RND_SPECIAL as a literal constant
      # instead of deriving it from the loop counter.
  356 SKEIN_RND_SPECIAL       =   1000
  357 SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
  358 SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
  359 SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
  360 #
  361 .macro Skein_Debug_Block BLK_BITS
  362 #
  363 #void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
  364 #                     const u08b_t *blkPtr, const u64b_t *wPtr, 
  365 #                     const u64b_t *ksPtr,const u64b_t *tsPtr)
  366 #
      # Debug-build version: saves the nine volatile integer registers, loads
      # the seven arguments of Skein_Show_Block (six in registers, the tweak
      # pointer on the stack), calls it, then restores everything.
      # Expects rbp and the frame offsets as set up by Setup_Stack.
      # Nine register pushes plus one argument push is an even number of
      # 8-byte slots, so rsp keeps its 16-byte phase at the call; the first
      # assembly-time check fails if the save list ever becomes even.
  367 _NN_ = 0
  368   .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
  369     pushq   %\_reg_                 #save all volatile regs on stack before the call
  370 _NN_ = _NN_ + 1
  371   .endr
  372     # get and push call parameters
  373     movq    $\BLK_BITS      ,%rdi   #bits
  374     movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
  375     leaq    X_VARS    (%rsi),%rdx   #X (pointer)
  376     movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
  377     leaq    Wcopy +F_O(%rbp),%r8    #wPtr
  378     leaq    ksKey +F_O(%rbp),%r9    #key pointer
  379     leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
  380     pushq   %rax                    #   (pass on the stack)
  381     call    Skein_Show_Block        #call external debug handler
  382     addq    $8*1,%rsp               #discard parameters on stack
  383   .if (_NN_ % 2 ) == 0              #check stack alignment
  384     .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  385   .endif
  386   .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
  387     popq    %\_reg_                 #restore regs
  388 _NN_ = _NN_ - 1
  389   .endr
  390   .if _NN_
  391     .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  392   .endif
  393 .endm # Skein_Debug_Block
  394 #
  395 # the macro to "call" to debug a round
      #
      # Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp   (debug-build version)
      #   BLK_BITS  256/512/1024; selects the Skein_Debug_Round_<BLK_BITS> helper
      #   R         round number, or one of the SKEIN_RND_* special values
      #   RDI_OFFS  optional constant added to the computed round number
      #   afterOp   optional text emitted after the call
      # rdx is saved, loaded with the round "number", and restored after the
      # call.  For fully unrolled code or special values R is used directly;
      # otherwise the number is 1+((R-1)&3)+RDI_OFFS+4*counter, where the
      # counter is rdi (256/512) or the stack slot rIdx_offs (1024).
      # NOTE(review): rIdx_offs is not defined in the visible part of the file.
      # NOTE(review): afterOp is written without a backslash; presumably this
      # relies on .altmacro substituting bare parameter names -- confirm.
  396 #
  397 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
  398     # call the appropriate (local) debug "function"
  399     pushq   %rdx                    #save rdx, so we can use it for round "number"
  400   .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
  401     movq    $\R,%rdx
  402   .else                             #compute round number using rdi
  403 _rOffs_ = \RDI_OFFS + 0
  404    .if \BLK_BITS == 1024
  405     movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
  406     leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
  407    .else
  408     leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
  409    .endif
  410   .endif
  411     call    Skein_Debug_Round_\BLK_BITS
  412     popq    %rdx                    #restore original rdx value
  413 #
  414     afterOp
  415 .endm  #  Skein_Debug_Round
  416 .else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
  417 .macro Skein_Debug_Block BLK_BITS            #no-debug stand-in: expands to nothing
  418 .endm
  419 #
      # No-debug stand-in for Skein_Debug_Round: expands to nothing.
      # NOTE(review): the debug version emits its afterOp argument, this one
      # drops it; no caller in the visible part of the file passes afterOp.
  420 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
  421 .endm
  422 #
  423 .endif # _SKEIN_DEBUG
  424 #
  425 #----------------------------------------------------------------
  426 # MACRO addReg: dstReg = dstReg + source register (+ optional constant).
      #   dstReg              destination register name, no percent sign
      #   srcReg_A, srcReg_B  pasted together to form the source register
      #                       name, so callers can pass "r" plus a computed
      #                       number; srcReg_B may be empty
      #   useAddOp            non-zero forces addq when there is no constant
      #   immOffs             optional constant; when non-zero a three-input
      #                       leaq is emitted regardless of useAddOp
      # Without a constant and without useAddOp, leaq is used unless the
      # symbol ASM_NO_LEA is defined, in which case addq is used.
      # leaq leaves the flags alone while addq changes them, so code after
      # addReg must not depend on the flags either way.
  427 .macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  428   .if \immOffs + 0
  429        leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  430   .elseif ((\useAddOp + 0) == 0)
  431     .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
  432        leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  433     .else
  434        addq    %\srcReg_A\srcReg_B,%\dstReg
  435     .endif
  436   .else
  437        addq    %\srcReg_A\srcReg_B,%\dstReg
  438   .endif
  439 .endm
  440 
  441 # keep Intel-style ordering here, to match addReg
      # MACRO xorReg: dstReg = dstReg XOR source register.  Arguments are given
      # destination first; the source name is srcReg_A and srcReg_B pasted
      # together (srcReg_B may be empty).  Register names carry no percent sign.
  442 .macro  xorReg dstReg,srcReg_A,srcReg_B
  443         xorq   %\srcReg_A\srcReg_B,%\dstReg
  444 .endm
  445 #
  446 #----------------------------------------------------------------
  447 # MACRO C_label: place an exported entry point at the current location.
      # Defines the label both as given and with one leading underscore, and
      # marks both spellings .global, so the symbol resolves under either
      # C symbol-naming convention.
  448 .macro C_label lName
  449  \lName:        #use both "genders" to work across linkage conventions
  450 _\lName:
  451     .global  \lName
  452     .global _\lName
  453 .endm
  454 #
  455 #=================================== Skein_256 =============================================
  456 #
  457 .if _USE_ASM_ & 256
  458 #
  459 # void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
  460 #
      # Arguments arrive in rdi (ctx), rsi (blkPtr), rdx (blkCnt) and
      # rcx (bitcntAdd); Setup_Stack copies them into the frame.
      # For every 32-byte input block:
      #   T[0] += bitcntAdd and is stored back to the context,
      #   the state is keyed with the chaining values and tweak,
      #   ROUNDS_256 rounds are run with a key injection after every 4th round,
      #   ctx->X[0..3] = final state XOR input block.
      # T[1] is stored back once on exit with bit 62 cleared (debug builds
      # also store it at the start of each block).
      # The block loop tests at the bottom (decq/jnz), so blkCnt must be >= 1.
      # Callee-saved registers are pushed by Setup_Stack and popped by
      # Reset_Stack; nothing is returned.
  461 #################
  462 #
  463 # code
  464 #
  465 C_label Skein_256_Process_Block
  466     Setup_Stack 256,((ROUNDS_256/8)+1)
  467     movq    TWEAK+8(%rdi),%r14       #r14 = T[1], kept in a register from block to block
  468     jmp     Skein_256_block_loop     #hop over the alignment padding
  469     .p2align 4
  470     # main hash loop for Skein_256
  471 Skein_256_block_loop:
  472     #
  473     # general register usage:
  474     #   RAX..RDX        = X0..X3    
  475     #   R08..R12        = ks[0..4]
  476     #   R13..R15        = ts[0..2]
  477     #   RSP, RBP        = stack/frame pointers
  478     #   RDI             = round counter or context pointer
  479     #   RSI             = temp
  480     #
  481     movq    TWEAK+0(%rdi)     ,%r13
  482     addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
  483     movq    %r14              ,%r15
  484     xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak 
  485 
  486     movq    $KW_PARITY        ,%r12
  487     movq       X_VARS+ 0(%rdi),%r8
  488     movq       X_VARS+ 8(%rdi),%r9 
  489     movq       X_VARS+16(%rdi),%r10
  490     movq       X_VARS+24(%rdi),%r11
  491     movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
  492     xorq    %r8               ,%r12  #start accumulating overall parity
  493 
  494     movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
  495     xorq    %r9               ,%r12
  496     movq     0(%rsi)          ,%rax  #get X[0..3]
  497     xorq    %r10              ,%r12
  498     movq     8(%rsi)          ,%rbx
  499     xorq    %r11              ,%r12  #r12 = ks[4] = KW_PARITY ^ ks[0] ^ ks[1] ^ ks[2] ^ ks[3]
  500     movq    16(%rsi)          ,%rcx
  501     movq    24(%rsi)          ,%rdx
  502 
  503     movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
  504     movq    %rbx,Wcopy+ 8+F_O(%rbp)    
  505     movq    %rcx,Wcopy+16+F_O(%rbp)    
  506     movq    %rdx,Wcopy+24+F_O(%rbp)    
  507 
  508     addq    %r8 ,%rax                #initial key injection
  509     addq    %r9 ,%rbx 
  510     addq    %r10,%rcx
  511     addq    %r11,%rdx
  512     addq    %r13,%rbx                #X1 also gets ts[0]
  513     addq    %r14,%rcx                #X2 also gets ts[1]
  514 
  515 .if _SKEIN_DEBUG
  516     movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
  517     movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
  518     movq    %r9 ,ksKey+ 8+F_O(%rbp)    
  519     movq    %r10,ksKey+16+F_O(%rbp)    
  520     movq    %r11,ksKey+24+F_O(%rbp)    
  521     movq    %r12,ksKey+32+F_O(%rbp)    
  522                                        
  523     movq    %r13,ksTwk+ 0+F_O(%rbp)    
  524     movq    %r14,ksTwk+ 8+F_O(%rbp)    
  525     movq    %r15,ksTwk+16+F_O(%rbp)    
  526                                        
  527     movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
  528     movq    %rbx,X_stk + 8(%rsp)       
  529     movq    %rcx,X_stk +16(%rsp)       
  530     movq    %rdx,X_stk +24(%rsp)       
  531 
  532     Skein_Debug_Block 256            #debug dump
  533     Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
  534 .endif
  535 #
  536 .if (((SKEIN_ASM_UNROLL) & 256) == 0)
      # Looping code keeps the key schedule in memory and indexes it by rdi.
      # ks[0] and ts[0] have already been used by the initial injection above,
      # so they are stored one rotation ahead: ks[0] in key slot 5 (ksKey+40)
      # and ts[0] in tweak slot 3 (ksTwk+24), which is where the rdi-indexed
      # loads of later passes pick them up.  ksTwk is three words long and
      # ksKey follows it directly in the frame, so tweak slot 3 is the same
      # memory as key slot 0, which the round loop never loads.
  537     movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
  538     movq    %r9 ,ksKey+ 8+F_O(%rbp)    
  539     movq    %r10,ksKey+16+F_O(%rbp)    
  540     movq    %r11,ksKey+24+F_O(%rbp)    
  541     movq    %r12,ksKey+32+F_O(%rbp)    
  542                                        
  543     movq    %r13,ksTwk+24+F_O(%rbp)    
  544     movq    %r14,ksTwk+ 8+F_O(%rbp)    
  545     movq    %r15,ksTwk+16+F_O(%rbp)    
  546 .endif
  547     addq    $WCNT*8,%rsi             #skip the block
  548     movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
  549     #
  550     # now the key schedule is computed. Start the rounds
  551     #
  552 .if (SKEIN_ASM_UNROLL) & 256
  553 _UNROLL_CNT =   ROUNDS_256/8
  554 .else
  555 _UNROLL_CNT =   SKEIN_UNROLL_256
  556   .if ((ROUNDS_256/8) % _UNROLL_CNT)
  557     .error "Invalid SKEIN_UNROLL_256"
  558   .endif
  559     xorq    %rdi,%rdi                #rdi = iteration count
  560 Skein_256_round_loop:
  561 .endif
      # Each pass of the .rept below emits four rounds followed by one key
      # injection.  In the fully unrolled build the key words stay in r8..r12
      # and the tweak words in r13..r15, and rdi/rsi are used as temporaries;
      # in the looping build rdi counts injections and the schedule words are
      # reloaded from, and rotated through, the stack.
  562 _Rbase_ = 0
  563 .rept _UNROLL_CNT*2
  564     # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
  565     # round 4*_RBase_ + 0
  566     addReg  rax, rbx
  567     RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
  568     addReg  rcx, rdx
  569                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  570                     movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
  571                 .endif
  572     xorReg  rbx, rax
  573     RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
  574     xorReg  rdx, rcx
  575   .if (SKEIN_ASM_UNROLL) & 256
  576     .irp _r0_,%( 8+(_Rbase_+3) % 5)
  577     .irp _r1_,%(13+(_Rbase_+2) % 3)
  578       leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
  579     .endr
  580     .endr
  581   .endif
  582                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  583                     movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
  584                 .endif
  585     Skein_Debug_Round 256,%(4*_Rbase_+1)
  586 
  587     # round 4*_Rbase_ + 1
  588     addReg  rax, rdx
  589     RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
  590     xorReg  rdx, rax
  591                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  592                     movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
  593                 .endif
  594     addReg  rcx, rbx
  595     RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
  596     xorReg  rbx, rcx
  597                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  598                     movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
  599                 .endif
  600     Skein_Debug_Round 256,%(4*_Rbase_+2)
  601  .if (SKEIN_ASM_UNROLL) & 256
  602     .irp _r0_,%( 8+(_Rbase_+2) % 5)
  603     .irp _r1_,%(13+(_Rbase_+1) % 3)
  604       leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
  605     .endr
  606     .endr
  607  .endif
  608     # round 4*_Rbase_ + 2
  609     addReg  rax, rbx
  610     RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
  611     addReg  rcx, rdx
  612                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  613                     movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
  614                 .endif
  615     xorReg  rbx, rax
  616     RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
  617     xorReg  rdx, rcx
  618                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  619                     movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
  620                     leaq 1(%r11,%rdi),%r11               #precompute key + tweak
  621                 .endif
  622     Skein_Debug_Round 256,%(4*_Rbase_+3)
  623     # round 4*_Rbase_ + 3
  624     addReg  rax, rdx
  625     RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
  626     addReg  rcx, rbx
  627                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  628                     addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
  629                     movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
  630                 .endif
  631     xorReg  rdx, rax
  632     RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
  633     xorReg  rbx, rcx
  634     Skein_Debug_Round 256,%(4*_Rbase_+4)
  635                 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  636                     addReg r9 ,r13           #precompute key+tweak
  637                 .endif
  638       #inject key schedule words
  639 _Rbase_ = _Rbase_+1
  640   .if (SKEIN_ASM_UNROLL) & 256
  641     addReg    rax,r,%(8+((_Rbase_+0) % 5))
  642     addReg    rbx,rsi
  643     addReg    rcx,rdi
  644     addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  645   .else
  646     incq      %rdi
  647     addReg    rax,r8 
  648     addReg    rcx,r10
  649     addReg    rbx,r9 
  650     addReg    rdx,r11
  651   .endif
  652     Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
  653 .endr #rept _UNROLL_CNT*2
  654 #
  655 .if ((SKEIN_ASM_UNROLL) & 256) == 0
  656     cmpq    $2*(ROUNDS_256/8),%rdi   #one key injection per 4 rounds: ROUNDS_256/4 in total
  657     jb      Skein_256_round_loop
  658 .endif # (SKEIN_ASM_UNROLL & 256) == 0
  659     movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
  660 
  661     #----------------------------
  662     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
  663     movq    $FIRST_MASK64 ,%r14
  664     xorq    Wcopy + 0+F_O (%rbp),%rax
  665     xorq    Wcopy + 8+F_O (%rbp),%rbx
  666     xorq    Wcopy +16+F_O (%rbp),%rcx
  667     xorq    Wcopy +24+F_O (%rbp),%rdx
  668     andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared, used for the next block
  669     movq    %rax,X_VARS+ 0(%rdi)             #store final result
  670     movq    %rbx,X_VARS+ 8(%rdi)        
  671     movq    %rcx,X_VARS+16(%rdi)        
  672     movq    %rdx,X_VARS+24(%rdi)        
  673 
  674     Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
  675 
  676     # go back for more blocks, if needed
  677     decq    blkCnt+F_O(%rbp)
  678     jnz     Skein_256_block_loop
  679     movq    %r14,TWEAK + 8(%rdi)             #write the masked T[1] back to the context
  680     Reset_Stack
  681     ret
  682 Skein_256_Process_Block_End:
  683 
  684   .if _SKEIN_DEBUG                   #local helper reached by "call" from the Skein_Debug_Round macro
      # Stack on entry: return address at rsp, the original rdx pushed by the
      # macro just above it.  Copies X0..X3 into X_stk (X3 is the original rdx,
      # because rdx itself now carries the round number), then loads
      # rdi = 256 and rsi = context pointer and jumps to
      # Skein_Debug_Round_Common with rsi and rdi still pushed.
      # NOTE(review): Skein_Debug_Round_Common is not in the visible part of
      # the file; presumably it pops rdi/rsi and returns -- confirm.
  685 Skein_Debug_Round_256:               #here with rdx == round "number" from macro
  686     pushq   %rsi                     #save two regs for BLK_BITS-specific parms
  687     pushq   %rdi
  688     movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
  689     movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
  690     movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
  691     movq    %rcx,X_stk+16+F_O(%rbp)
  692     movq    %rdi,X_stk+24+F_O(%rbp)
  693 
  694     movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
  695     movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
  696     jmp     Skein_Debug_Round_Common
  697   .endif
  698 #
  699 .if _SKEIN_CODE_SIZE                 #optional helper entry points for code-size reporting
      # Skein_256_Process_Block_CodeSize: returns in rax the byte distance from
      # Skein_256_Process_Block to Skein_256_Process_Block_End.
  700 C_label  Skein_256_Process_Block_CodeSize
  701     movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
  702     ret
  703 #
      # Skein_256_Unroll_Cnt: returns in rax the unroll count used for the
      # Skein-256 round loop, or 0 when the rounds are fully unrolled.
      # _UNROLL_CNT still holds the value assigned in the Skein-256 code above.
  704 C_label Skein_256_Unroll_Cnt
  705   .if _UNROLL_CNT <> ROUNDS_256/8
  706     movq    $_UNROLL_CNT,%rax
  707   .else
  708     xorq    %rax,%rax
  709   .endif
  710     ret
  711 .endif
  712 #
  713 .endif #_USE_ASM_ & 256
  714 #
  715 #=================================== Skein_512 =============================================
  716 #
  717 .if _USE_ASM_ & 512
  718 #
  719 # void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
  720 #
  721 # X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
  722 #
  723 #################
  724 # MACRO: one round for 512-bit blocks
      #
      # R_512_OneRound rn0..rn7,_Rn_,op1,op2,op3,op4
      #   rn0..rn7  register numbers (8..15) naming four register pairs; for
      #             each pair (even,odd): even += odd, odd is rotated left by
      #             the table constant, odd ^= even
      #   _Rn_      round number; (_Rn_ mod 8) selects the RC_512 table row and
      #             the pair position selects the column
      #   op1..op4  optional instruction text emitted after the 1st..4th pair;
      #             the callers use it to interleave key schedule loads/stores
      # Ends with the debug hook for round _Rn_+1, using an offset of -4.
  725 #
  726 .macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
  727 #
  728     addReg      r\rn0, r\rn1
  729     RotL64      r\rn1, 512,%((\_Rn_) % 8),0
  730     xorReg      r\rn1, r\rn0
  731             \op1
  732     addReg      r\rn2, r\rn3
  733     RotL64      r\rn3, 512,%((\_Rn_) % 8),1
  734     xorReg      r\rn3, r\rn2
  735             \op2
  736     addReg      r\rn4, r\rn5
  737     RotL64      r\rn5, 512,%((\_Rn_) % 8),2
  738     xorReg      r\rn5, r\rn4
  739             \op3
  740     addReg      r\rn6, r\rn7
  741     RotL64      r\rn7, 512,%((\_Rn_) % 8),3
  742     xorReg      r\rn7, r\rn6
  743             \op4
  744     Skein_Debug_Round 512,%(\_Rn_+1),-4
  745 #
  746 .endm #R_512_OneRound
  747 #
  748 #################
  749 # MACRO: four rounds plus one key injection for 512-bit blocks
      #
      # R_512_FourRounds _RR_
      #   _RR_  number of the first of the four rounds (the caller passes a
      #         multiple of 4)
      # State words X0..X7 live in r8..r15.  The word permutation between
      # rounds is expressed by the order of the register numbers handed to
      # R_512_OneRound, so no data is moved.  While the rounds run, the key
      # schedule words needed for the injection are gathered in
      # rax,rbx,rcx,rdx,rsi; rcx and rdx also receive their tweak words.
      # Fully unrolled build: schedule slots are constant offsets computed
      # from the injection number _II_ (mod 9 for key words, mod 3 for tweak).
      # Looping build: rdi is the injection counter, incremented first; slots
      # are indexed by rdi, and one key word and one tweak word are copied
      # further up the stack on each pass so the indexed slots stay valid.
  750 #
  751 .macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
  752   .if ((SKEIN_ASM_UNROLL) & 512)
  753     # here for fully unrolled case.
  754     _II_ = ((\_RR_)/4) + 1       #key injection counter
  755     R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
  756     R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
  757     R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
  758     R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
  759     # inject the key schedule
  760     addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
  761     addReg   r11, rax
  762     addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
  763     addReg   r12, rbx
  764     addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
  765     addReg   r13, rcx
  766     addReg   r14, rdx
  767     addReg   r15, rsi,,,(_II_)
  768   .else
  769     # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
  770     incq    %rdi                 #bump key injection counter
  771     R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
  772     R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
  773     R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
  774     R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
  775     # inject the key schedule
  776     addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
  777     addReg   r11, rax
  778     addReg   r12, rbx
  779     addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
  780     addReg   r13, rcx
  781     addReg   r14, rdx
  782     addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
  783     addReg   r15, rsi
  784     addReg   r15, rdi              #inject the round number
  785   .endif
  786 
  787     #show the result of the key injection
  788     Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
  789 .endm #R_512_FourRounds
  790 #
  791 #################
  792 # instantiated code
  793 #
  794 C_label Skein_512_Process_Block
      # Entry point of the 512-bit block function.  The context is reached
      # through %rdi (TWEAK and X_VARS offsets); the block pointer, block count
      # and bit increment are read from frame slots (blkPtr, blkCnt, bitAdd),
      # presumably filled in by Setup_Stack, which is defined outside this section.
  795     Setup_Stack 512,ROUNDS_512/8
  796     movq    TWEAK+ 8(%rdi),%rbx       #%rbx = ctx->h.T[1], carried in a register from block to block
  797     jmp     Skein_512_block_loop      #hop over the padding emitted by .p2align
  798     .p2align 4
  799     # main hash loop for Skein_512: one iteration per input block
  800 Skein_512_block_loop:
  801     # general register usage:
  802     #   RAX..RDX       = temps for key schedule pre-loads
  803     #   R8 ..R15       = X0..X7
  804     #   RSP, RBP       = stack/frame pointers
  805     #   RDI            = context pointer; round counter inside the looping (not fully unrolled) rounds
  806     #   RSI            = input block pointer during setup, temp in the round macros
  807     #
  808     movq    TWEAK +  0(%rdi),%rax
  809     addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
  810     movq    %rbx,%rcx
  811     xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule (T0, T1, T0^T1)
  812     movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
  813     movq    %rax,ksTwk+ 0+F_O(%rbp)
  814     movq    $KW_PARITY,%rdx           #seed of the key schedule parity word
  815     movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
  816     movq    %rbx,ksTwk+ 8+F_O(%rbp)
  817     movq    %rcx,ksTwk+16+F_O(%rbp)
  818     .irp _Rn_,8,9,10,11,12,13,14,15
  819       movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
  820       xorq  %r\_Rn_,%rdx              #compute overall parity
  821       movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
  822     .endr                             #load state into %r8 ..%r15, compute parity
  823       movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
  824 

  825     addReg   r13,rax                  #precompute key injection for tweak
  826     addReg   r14, rbx
  827 .if _SKEIN_DEBUG
  828     movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
  829 .endif
  830     movq     0(%rsi),%rax             #load input block
  831     movq     8(%rsi),%rbx 
  832     movq    16(%rsi),%rcx 
  833     movq    24(%rsi),%rdx 
  834     addReg   r8 , rax                 #do initial key injection
  835     addReg   r9 , rbx
  836     movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
  837     movq    %rbx,Wcopy+ 8+F_O(%rbp)
  838     addReg   r10, rcx
  839     addReg   r11, rdx
  840     movq    %rcx,Wcopy+16+F_O(%rbp)
  841     movq    %rdx,Wcopy+24+F_O(%rbp)
  842 

  843     movq    32(%rsi),%rax             #same treatment for input words 4..7
  844     movq    40(%rsi),%rbx 
  845     movq    48(%rsi),%rcx 
  846     movq    56(%rsi),%rdx
  847     addReg   r12, rax
  848     addReg   r13, rbx
  849     addReg   r14, rcx
  850     addReg   r15, rdx
  851     movq    %rax,Wcopy+32+F_O(%rbp)    
  852     movq    %rbx,Wcopy+40+F_O(%rbp)    
  853     movq    %rcx,Wcopy+48+F_O(%rbp)    
  854     movq    %rdx,Wcopy+56+F_O(%rbp)    
  855 

  856 .if _SKEIN_DEBUG
  857     .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
  858       movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
  859     .endr
  860 

  861     Skein_Debug_Block 512             #debug dump
  862     Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
  863 .endif
  864     addq    $8*WCNT,%rsi              #skip the block
  865     movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
  866     #
  867     #################
  868     # now the key schedule is computed. Start the rounds
  869     #
  870 .if (SKEIN_ASM_UNROLL) & 512          #fully unrolled: every round is emitted inline
  871 _UNROLL_CNT =   ROUNDS_512/8
  872 .else                                 #looping: emit _UNROLL_CNT*8 rounds and iterate over them
  873 _UNROLL_CNT =   SKEIN_UNROLL_512
  874   .if ((ROUNDS_512/8) % _UNROLL_CNT)  #the unroll count must divide ROUNDS_512/8
  875     .error "Invalid SKEIN_UNROLL_512"
  876   .endif
  877     xorq    %rdi,%rdi                 #rdi = round counter
  878 Skein_512_round_loop:
  879 .endif
  880 #
  881 _Rbase_ = 0
  882 .rept _UNROLL_CNT*2                   #two R_512_FourRounds per unit of _UNROLL_CNT
  883       R_512_FourRounds %(4*_Rbase_+00)
  884 _Rbase_ = _Rbase_+1
  885 .endr #rept _UNROLL_CNT*2
  886 #
  887 .if ((SKEIN_ASM_UNROLL) & 512) == 0
  888     cmpq    $2*(ROUNDS_512/8),%rdi    #loop until the counter reaches 2*(ROUNDS_512/8)
  889     jb      Skein_512_round_loop
  890     movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
  891 .endif
  892     # end of rounds
  893     #################
  894     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
  895     .irp _Rn_,8,9,10,11,12,13,14,15
  896   .if (\_Rn_ == 8)
  897     movq    $FIRST_MASK64,%rbx        #start building T[1] for the next block (interleaved with the XORs)
  898   .endif
  899       xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
  900       movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
  901   .if (\_Rn_ == 14)
  902     andq    TWEAK+ 8(%rdi),%rbx       #%rbx = ctx->h.T[1] AND FIRST_MASK64
  903   .endif
  904     .endr
  905     Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
  906 

  907     # go back for more blocks, if needed
  908     decq    blkCnt+F_O(%rbp)
  909     jnz     Skein_512_block_loop
  910     movq    %rbx,TWEAK + 8(%rdi)      #all blocks done: store the masked T[1] back into the context
  911 

  912     Reset_Stack
  913     ret
  914 Skein_512_Process_Block_End:
  915 #
  916   .if _SKEIN_DEBUG
  917 # call here with rdx  = "round number"
      # Debug-only helper: spills X0..X7 (r8..r15) into X_stk, loads the
      # block size and context pointer, then continues in Skein_Debug_Round_Common.
  918 Skein_Debug_Round_512:
  919     pushq   %rsi                     #save two regs for BLK_BITS-specific parms
  920     pushq   %rdi                     #(both are popped again at the end of Skein_Debug_Round_Common)
  921   .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
  922     movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
  923   .endr
  924     movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
  925     movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
  926     jmp     Skein_Debug_Round_Common #jump, not call: the common code executes the ret
  927   .endif
  928 #
  929 .if _SKEIN_CODE_SIZE
      # Optional introspection entry points, only assembled when _SKEIN_CODE_SIZE is nonzero.
  930 C_label Skein_512_Process_Block_CodeSize
  931     movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax   #rax = End label minus entry label (code size in bytes)
  932     ret
  933 #
  934 C_label Skein_512_Unroll_Cnt
  935   .if _UNROLL_CNT <> (ROUNDS_512/8)
  936     movq    $_UNROLL_CNT,%rax        #rax = unroll count used for the round loop
  937   .else
  938     xorq    %rax,%rax                #rax = 0 when _UNROLL_CNT equals ROUNDS_512/8 (all rounds inline)
  939   .endif
  940     ret
  941 .endif
  942 #
  943 .endif # _USE_ASM_ & 512
  944 #
  945 #=================================== Skein1024 =============================================
  946 .if _USE_ASM_ & 1024
  947 #
  948 # void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
  949 #
  950 #################
  951 # use details of permutation to make register assignments
  952 # 
  953 o1K_rdi =  0        #offsets in X[] associated with each register
  954 o1K_rsi =  1 
  955 o1K_rbp =  2 
  956 o1K_rax =  3 
  957 o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
  958 o1K_rbx =  5 
  959 o1K_rdx =  7        #(no symbol for word 6: it has no register of its own, see rcx)
  960 o1K_r8  =  8  
  961 o1K_r9  =  9  
  962 o1K_r10 = 10
  963 o1K_r11 = 11
  964 o1K_r12 = 12
  965 o1K_r13 = 13
  966 o1K_r14 = 14
  967 o1K_r15 = 15
  968 #
  969 rIdx_offs = tmpStk_1024                #stack slot that holds the key injection loop index (looping build)
  970 #
  971 .macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
      # One MIX step of a Skein1024 round, plus key injection on every 4th round.
      #   w0,w1     = state word numbers of the pair (select key/tweak words and debug slots)
      #   reg0,reg1 = registers currently holding those two words
      #   _RN0_     = round number (rotation selected by _RN0_ mod 8; injection when _RN0_ mod 4 == 3)
      #   _Rn1_     = index of this MIX within the round, passed on to RotL64
      #   op1       = optional extra instruction emitted after everything else
  972     addReg      \reg0 , \reg1                      #perform the MIX
  973     RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
  974     xorReg      \reg1 , \reg0
  975 .if ((\_RN0_) & 3) == 3        #time to do key injection?
  976  .if _SKEIN_DEBUG
  977     movq       %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
  978     movq       %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
  979  .endif
  980 _II_ = ((\_RN0_)/4)+1           #injection count
  981  .if (SKEIN_ASM_UNROLL) & 1024   #here to do fully unrolled key injection
  982     addq        ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0   #key word index computed at assembly time
  983     addq        ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
  984   .if     \w1 == 13                                #tweak injection
  985     addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
  986   .elseif \w0 == 14
  987     addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
  988   .elseif \w1 == 15
  989     addq        $_II_, %\reg1                      #(injection counter)
  990   .endif
  991  .else                          #here to do looping  key injection
      # In the looping build %rdi doubles as the run-time index into ksKey/ksTwk.
      # X0 is parked in X_stk first; its key word is added at the end of
      # r1024_FourRounds, after the index has been bumped.  The extra +8 in the
      # displacements below accounts for %rdi still holding the pre-bump index.
  992   .if  (\w0 == 0)
  993     movq        %rdi, X_stk+8*\w0(%rsp)            #if so, store N0 so we can use reg as index
  994     movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
  995   .else
  996     addq         ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
  997   .endif
  998   .if     \w1 == 13                                #tweak injection
  999     addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1  
 1000   .elseif \w0 == 14
 1001     addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0  
 1002   .elseif \w1 == 15
 1003     addReg      \reg1,rdi,,,1                      #(injection counter)
 1004   .endif
 1005     addq         ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
 1006  .endif
 1007 .endif
 1008     # insert the op provided, if any
 1009     \op1
 1010 .endm
 1011 #################
 1012 # MACRO: four rounds for 1024-bit blocks
 1013 #
 1014 .macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
 1015     # should be here with X4 set properly, X6 stored on stack
      # Each round is eight r1024_Mix invocations.  The mix with index 3 (the one
      # that needs the other of X4/X6 in rcx) is issued late, after one mix has
      # saved rcx to X_stk and a following one has reloaded it.
      # The fourth round (_Rn_ = _RR_+3) triggers the key injection inside r1024_Mix.
 1016 _Rn_ = (\_RR_) + 0
 1017         r1024_Mix  0, 1,rdi,rsi,_Rn_,0
 1018         r1024_Mix  2, 3,rbp,rax,_Rn_,1
 1019         r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
 1020         r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack 
 1021         r1024_Mix 10,11,r10,r11,_Rn_,5
 1022         r1024_Mix 12,13,r12,r13,_Rn_,6
 1023         r1024_Mix  6, 7,rcx,rdx,_Rn_,3
 1024         r1024_Mix 14,15,r14,r15,_Rn_,7
 1025     .if _SKEIN_DEBUG
 1026       Skein_Debug_Round 1024,%(_Rn_+1)
 1027     .endif
 1028 _Rn_ = (\_RR_) + 1
 1029         r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
 1030         r1024_Mix  2,13,rbp,r13,_Rn_,1
 1031         r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
 1032         r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack 
 1033         r1024_Mix 12, 3,r12,rax,_Rn_,5
 1034         r1024_Mix 14, 5,r14,rbx,_Rn_,6
 1035         r1024_Mix  4,15,rcx,r15,_Rn_,3
 1036         r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
 1037     .if _SKEIN_DEBUG
 1038       Skein_Debug_Round 1024,%(_Rn_+1)
 1039     .endif
 1040 _Rn_ = (\_RR_) + 2
 1041         r1024_Mix  0, 7,rdi,rdx,_Rn_,0
 1042         r1024_Mix  2, 5,rbp,rbx,_Rn_,1
 1043         r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
 1044         r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack 
 1045         r1024_Mix 14,13,r14,r13,_Rn_,5
 1046         r1024_Mix  8,11,r8 ,r11,_Rn_,6
 1047         r1024_Mix  6, 1,rcx,rsi,_Rn_,3
 1048         r1024_Mix 10, 9,r10,r9 ,_Rn_,7
 1049     .if _SKEIN_DEBUG
 1050       Skein_Debug_Round 1024,%(_Rn_+1)
 1051     .endif
 1052 _Rn_ = (\_RR_) + 3
 1053         r1024_Mix  0,15,rdi,r15,_Rn_,0
 1054         r1024_Mix  2,11,rbp,r11,_Rn_,1
 1055         r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
 1056         r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack 
 1057         r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
 1058         r1024_Mix 10, 3,r10,rax,_Rn_,6
 1059         r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
 1060         r1024_Mix 12, 7,r12,rdx,_Rn_,7
 1061     .if _SKEIN_DEBUG
 1062       Skein_Debug_Round 1024,%(_Rn_+1)
 1063     .endif
 1064 

 1065   .if ((SKEIN_ASM_UNROLL) & 1024) == 0           #here with rdi == rIdx, X0 on stack
 1066     #"rotate" the key schedule on the stack
      # Copying ksKey[idx] to ksKey[idx+17] and ksTwk[idx] to ksTwk[idx+3] lets the
      # looping r1024_Mix index the schedules linearly instead of reducing mod 17 / mod 3.
 1067 i8 = o1K_r8
 1068 i0 = o1K_rdi
 1069     movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
 1070     movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
 1071     movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
 1072     movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
 1073     movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
 1074     movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
 1075     incq    %rdi                                #bump the index
 1076     movq    %rdi, rIdx_offs (%rsp)              #save rdi again
 1077     movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
 1078     addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
 1079   .endif
 1080     #show the result of the key injection
 1081     Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
 1082 .endm #r1024_FourRounds
 1083 #
 1084 ################
 1085 # code
 1086 #
 1087 C_label Skein1024_Process_Block
 1088 #
      # Entry point of the 1024-bit block function.  Frame slots (blkPtr, blkCnt,
      # bitAdd, ctxPtr) are presumably filled in by Setup_Stack, which is defined
      # outside this section.
 1089     Setup_Stack 1024,ROUNDS_1024/8,WCNT
 1090     movq    TWEAK+ 8(%rdi),%r9             #%r9 = ctx->h.T[1], expected in %r9 at the top of every block
 1091     jmp     Skein1024_block_loop           #hop over the padding emitted by .p2align
 1092     # main hash loop for Skein1024
 1093     .p2align 4
 1094 Skein1024_block_loop:
 1095     # general register usage:
 1096     #   RSP              = stack pointer
 1097     #   RDI,RSI,RBP,RAX  = X0..X3 during the rounds (o1K_* table); RDI/RBP = ctx/frame pointers during setup
 1098     #   RCX              = X4 or X6 (the other one is in X_stk), RBX = X5, RDX = X7
 1099     #   R8 ..R15         = X8..X15    (state words)
 1100     #
 1101   .if ((SKEIN_ASM_UNROLL) & 1024) == 0
 1102     xorq    %rax,%rax                      #init loop index on the stack
 1103     movq    %rax,rIdx_offs(%rsp)
 1104   .endif
 1105     movq         TWEAK+     0(%rdi),%r8
 1106     addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
 1107     movq    %r9 ,%r10 
 1108     xorq    %r8 ,%r10                      #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1)
 1109     movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
 1110     movq    %r8 ,ksTwk+ 0+F_O(%rbp)
 1111     movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
 1112     movq    %r10,ksTwk+16+F_O(%rbp)
 1113   .if _SKEIN_DEBUG
 1114     movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
 1115   .endif
 1116     movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
 1117     movq        $KW_PARITY        ,%rax    #overall key schedule parity
 1118 

 1119     # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
      # Words 0..4 and 6 cannot be loaded into their final registers yet (rdi, rsi,
      # rbp and rax are still in use, rcx serves as temp below), so they go through
      # r14/r15 into X_stk.
 1120     .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
 1121       movq       X_VARS+8*\_rN_(%rdi),%r14 #get state word
 1122       movq              8*\_rN_(%rsi),%r15 #get msg   word
 1123       xorq  %r14,%rax                      #update key schedule overall parity
 1124       movq  %r14,ksKey +8*\_rN_+F_O(%rbp)  #save key schedule word on stack
 1125       movq  %r15,Wcopy +8*\_rN_+F_O(%rbp)  #save local msg Wcopy
 1126       addq  %r15,%r14                      #do the initial key injection
 1127       movq  %r14,X_stk +8*\_rN_    (%rsp)  #save initial state var on stack
 1128     .endr
 1129     # now process the rest, using the "real" registers 
 1130     #     (MUST do it in reverse order to inject tweaks r8/r9 first,
      #      i.e. before r8/r9 themselves are overwritten with state words)
 1131     .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
 1132 _oo_ = o1K_\_rr_                           #offset associated with the register
 1133       movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
 1134       movq         8*_oo_(%rsi),%rcx       #get next input msg word
 1135       movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
 1136       xorq  %\_rr_, %rax                   #accumulate key schedule parity
 1137       movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
 1138       addq  %rcx,%\_rr_                    #do the initial  key  injection
 1139       .if    _oo_ == 13                    #do the initial tweak injection
 1140         addReg \_rr_,r8                    #          (only in words 13/14)
 1141       .elseif _oo_ == 14
 1142         addReg \_rr_,r9
 1143       .endif
 1144     .endr
 1145     movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
 1146 .if _SKEIN_DEBUG
 1147     Skein_Debug_Block 1024                 #initial debug dump
 1148 .endif
 1149     addq     $8*WCNT,%rsi                  #bump the msg ptr
 1150     movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
 1151     # re-load words 0..4 from stack, enter the main loop
      # (rdi and rbp stop being ctx/frame pointers here; later accesses go through rsp)
 1152     .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
 1153       movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
 1154     .endr
 1155 .if _SKEIN_DEBUG
 1156     Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
 1157 .endif
 1158     #
 1159     #################
 1160     # now the key schedule is computed. Start the rounds
 1161     #
 1162 .if (SKEIN_ASM_UNROLL) & 1024              #fully unrolled: every round is emitted inline
 1163 _UNROLL_CNT =   ROUNDS_1024/8
 1164 .else                                      #looping: emit _UNROLL_CNT*8 rounds and iterate over them
 1165 _UNROLL_CNT =   SKEIN_UNROLL_1024
 1166   .if ((ROUNDS_1024/8) % _UNROLL_CNT)      #the unroll count must divide ROUNDS_1024/8
 1167     .error "Invalid SKEIN_UNROLL_1024"
 1168   .endif
 1169 Skein1024_round_loop:
 1170 .endif
 1171 #
 1172 _Rbase_ = 0
 1173 .rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
 1174       r1024_FourRounds %(4*_Rbase_+00)
 1175 _Rbase_ = _Rbase_+1
 1176 .endr #rept _UNROLL_CNT*2
 1177 #
 1178 .if ((SKEIN_ASM_UNROLL) & 1024) == 0
 1179     cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done (tmpStk_1024 is the rIdx_offs index slot)
 1180     jb      Skein1024_round_loop    
 1181 .endif
 1182     # end of rounds
 1183     #################
 1184     #
 1185     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
 1186     movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
 1187     movq       ctxPtr(%rsp),%rdx       #%rdx = context pointer for the stores below
 1188     
 1189     .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
 1190 _oo_ = o1K_\_rr_
 1191       xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
 1192       movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
 1193       .if (_oo_ ==  9)
 1194         movq   $FIRST_MASK64 ,%r9      #x9 has just been stored, so r9 is free to build the next T[1]
 1195       .endif
 1196       .if (_oo_ == 14)
 1197         andq   TWEAK+ 8(%rdx),%r9      #%r9 = ctx->h.T[1] AND FIRST_MASK64
 1198       .endif
 1199     .endr
 1200     # 
 1201     movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
 1202     movq         X_stk +8*7(%rsp),%rbx
 1203     xorq         Wcopy +8*6(%rsp),%rax
 1204     xorq         Wcopy +8*7(%rsp),%rbx
 1205     movq    %rax,X_VARS+8*6(%rdx)
 1206     decq             blkCnt(%rsp)      #set zero flag iff done
 1207     movq    %rbx,X_VARS+8*7(%rdx)
 1208 

 1209     Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
 1210     # go back for more blocks, if needed
 1211     movq             ctxPtr(%rsp),%rdi #don't muck with the flags here!
 1212     lea          FRAME_OFFS(%rsp),%rbp #lea leaves the flags alone as well; %rbp = frame pointer again
 1213     jnz     Skein1024_block_loop
 1214     movq    %r9 ,TWEAK+   8(%rdx)      #all blocks done: store the masked T[1] (rdx still = ctx)
 1215     Reset_Stack
 1216     ret
 1217 #
 1218 Skein1024_Process_Block_End:
 1219 #
 1220 .if _SKEIN_DEBUG
      # Debug-only helper: writes the register-resident state words into X_stk,
      # loads the block size and context pointer, then continues in
      # Skein_Debug_Round_Common.
 1221 Skein_Debug_Round_1024:
 1222     # call here with rdx  = "round number",
 1223 _SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
 1224     #
 1225   #save rest of X[] state on stack so debug routines can access it
 1226   .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
 1227     movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
 1228   .endr
 1229     # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it's already on stack
 1230     cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
 1231     jae     save_x0
 1232     testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
 1233     jz      save_x0_not
 1234 save_x0:
 1235     movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
 1236 save_x0_not:
 1237     #figure out the x4/x6 swapping state and save the correct one!
 1238     cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
 1239     jae     save_x4
 1240     testq   $1,%rdx                  #and even ones have x4 in rcx as well
 1241     jz      save_x4
 1242     movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)   #odd round number: rcx goes to the x6 slot
 1243     jmp     debug_1024_go
 1244 save_x4:
 1245     movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
 1246 debug_1024_go:
 1247     #now all is saved in Xstk[] except for rdx
 1248     push    %rsi                    #save two regs for BLK_BITS-specific parms
 1249     push    %rdi
 1250 _SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
 1251 

 1252     movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
 1253     movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
 1254 

 1255     movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
 1256     movq    $1024,%rdi                   #rdi = block size
 1257     jmp     Skein_Debug_Round_Common     #jump, not call: the common code pops rdi/rsi and executes the ret
 1258 .endif
 1259 #
 1260 .if _SKEIN_CODE_SIZE
      # Optional introspection entry points, only assembled when _SKEIN_CODE_SIZE is nonzero.
 1261 C_label Skein1024_Process_Block_CodeSize
 1262     movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax   #rax = End label minus entry label (code size in bytes)
 1263     ret
 1264 #
 1265 C_label Skein1024_Unroll_Cnt
 1266   .if _UNROLL_CNT <> (ROUNDS_1024/8)
 1267     movq    $_UNROLL_CNT,%rax        #rax = unroll count used for the round loop
 1268   .else
 1269     xorq    %rax,%rax                #rax = 0 when _UNROLL_CNT equals ROUNDS_1024/8 (all rounds inline)
 1270   .endif
 1271     ret
 1272 .endif
 1273 #
 1274 .endif # _USE_ASM_ and 1024
 1275 #
 1276 .if _SKEIN_DEBUG
 1277 #----------------------------------------------------------------
 1278 #local debug routine to set up for calls to:
 1279 #  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
 1280 #                       [       rdi                        rsi   rdx              rcx]
 1281 #
 1282 # here with %rdx = round number
 1283 #           %rsi = ctx_hdr_ptr
 1284 #           %rdi = block size (256/512/1024)
 1285 # on stack: saved rdi, saved rsi, retAddr, saved rdx  
 1286 #
      # Reached by jmp from the per-size Skein_Debug_Round_* stubs.  Saves twelve more
      # registers, picks the X[] pointer for the fourth argument, calls
      # Skein_Show_Round, restores everything it pushed plus the stub-saved
      # rdi/rsi, and returns to the original call site.  The saved rdx stays on the stack.
 1287 Skein_Debug_Round_Common:
 1288 _SP_OFFS_ = 32                        #account for four words on stack already
 1289   .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
 1290     pushq %\_rr_
 1291 _SP_OFFS_ = _SP_OFFS_+8               #track the push at assembly time
 1292   .endr
 1293   .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
 1294     .error  "Debug_Round_Common: stack alignment"
 1295   .endif
 1296     # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
 1297     leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
 1298     cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
 1299     jnz     _got_rcxA
 1300     leaq    X_VARS(%rsi),%rcx          #feedforward: show the chaining variables stored in the context
 1301 _got_rcxA:
 1302   .if _USE_ASM_ & 1024
 1303     # special handling for 1024-bit case
 1304     #    (for rounds right before with key injection: 
 1305     #        use xDebug_1024[] instead of X_stk[])
 1306     cmpq    $SKEIN_RND_SPECIAL,%rdx
 1307     jae     _got_rcxB               #special round codes keep the pointer chosen above
 1308     orq     %rdx,%rdx
 1309     jz      _got_rcxB               #round number 0: keep X_stk
 1310     test    $3,%rdx
 1311     jne     _got_rcxB               #not a multiple of 4: keep X_stk
 1312     cmp     $1024,%rdi              #only 1024-bit(s) for now
 1313     jne     _got_rcxB
 1314     leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx   #state as saved by r1024_Mix just before key injection
 1315 _got_rcxB:
 1316   .endif
 1317     call    Skein_Show_Round        #call external debug handler
 1318 

 1319   .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
 1320     popq  %\_rr_
 1321 _SP_OFFS_ = _SP_OFFS_-8
 1322   .endr
 1323   .if _SP_OFFS_ - 32                  #assembly-time check that pushes and pops balance
 1324     .error   "Debug_Round_Common: push/pop misalignment!"
 1325   .endif    
 1326     popq    %rdi                    #undo the two pushes done by the per-size stub
 1327     popq    %rsi
 1328     ret
 1329 .endif
 1330 #----------------------------------------------------------------
 1331     .section .note.GNU-stack,"",@progbits
 1332 
 1333     .end

Cache object: b8b2979f33123748464d971888f03395


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.