1 #
2 #----------------------------------------------------------------
3 # 64-bit x86 assembler code (gnu as) for Skein block functions
4 #
5 # Author: Doug Whiting, Hifn/Exar
6 #
7 # This code is released to the public domain.
8 #----------------------------------------------------------------
9 # $FreeBSD$
10 #
    .text
    .altmacro                               #needed for the %expr and <...> macro-argument forms used below
#ifndef __clang__
    .psize 0,128                            #list file has no page boundaries
#endif
16 #
_MASK_ALL_  = (256+512+1024)                #all three algorithm bits
_MAX_FRAME_ = 240                           #NOTE(review): not referenced in the visible part of this file
#
#################
# Select which block sizes get assembler code.  _USE_ASM_ is a bit mask made
# of the values 256, 512 and 1024; each section below tests "_USE_ASM_ & size".
# SKEIN_USE_ASM is a C-preprocessor symbol; without it all three are built.
#ifndef SKEIN_USE_ASM
_USE_ASM_         = _MASK_ALL_
#else
_USE_ASM_         = SKEIN_USE_ASM
#endif
#################
#configure loop unrolling
# SKEIN_LOOP (a C-preprocessor symbol) is read as three decimal digits:
# hundreds digit -> Skein-256, tens digit -> Skein-512, ones digit -> Skein-1024.
#ifndef SKEIN_LOOP
_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
#else
_SKEIN_LOOP       = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP                    #only display loop unrolling if default changed on command line
#.print  "+++ SKEIN_LOOP = \_NN_"
  .endr
#endif
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL collects, as a bit mask, every block size whose unroll count is 0
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
    .endif
  .endr
47 #################
48 #
# Round counts.  The defaults are 72/72/80.  SKEIN_ROUNDS, when it exists as an
# assembler symbol, is read as three decimal digits (hundreds -> 256,
# tens -> 512, ones -> 1024); a digit d selects 8*(((d+5) % 10)+5) rounds,
# so d=9 gives 72, d=0 gives 80, d=5 gives 40 and d=4 gives 112.
.ifndef SKEIN_ROUNDS
ROUNDS_256  =   72
ROUNDS_512  =   72
ROUNDS_1024 =   80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
.irp _NN_,256,512,1024
  .if _USE_ASM_ & \_NN_                     #report only the block sizes that are being assembled
    .irp _RR_,%(ROUNDS_\_NN_)               #%(...) turns the round count into decimal text for the message
      .if _NN_ < 1024                       #NOTE(review): both branches print the same text as shown here; presumably they once differed only in column alignment - confirm
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .else
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
70 #################
71 #
# _SKEIN_CODE_SIZE = 1 adds the small *_CodeSize and *_Unroll_Cnt helper
# functions after each block function; it is set when either SKEIN_CODE_SIZE
# or SKEIN_PERF exists as an assembler symbol.
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef SKEIN_PERF                           #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
# _SKEIN_DEBUG = 1 (SKEIN_DEBUG defined) turns the Skein_Debug_* macros into
# real calls to the external C display routines; otherwise they expand to nothing.
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG      = 0
.else
_SKEIN_DEBUG      = 1
.endif
89 #################
90 #
# Offsets of fields in the hash context structure.
# (Keep the first word of this comment away from preprocessor keywords: the
#  file is run through the C preprocessor, and a comment line that starts with
#  "# define" is taken as a macro definition.)
#
HASH_BITS   =   0                   #bits of hash output
BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
TWEAK       =   8 + BCNT            #tweak values[0..1]
X_VARS      =  16 + TWEAK           #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
FIRST_MASK  =   ~ (1 <<  6)         #NOTE(review): not used in the visible code; presumably the byte-wide form of the mask below - confirm
FIRST_MASK64=   ~ (1 << 62)         #clears bit 62; ANDed with ctx->h.T[1] at the end of every block
103 #
104 # rotation constants for Skein
105 #
# Skein-256 rotation amounts, named RC_256_<round mod 8>_<mix index>.
# RotL64 looks them up by that name; two MIX operations per round.
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32
129
# Skein-512 rotation amounts, named RC_512_<round mod 8>_<mix index>.
# Four MIX operations per round.
RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22
169
# Skein-1024 rotation amounts, named RC_1024_<round mod 8>_<mix index>.
# Eight MIX operations per round.
RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
241 #
# RotL64: rotate a 64-bit register left by a named Skein rotation constant.
#   reg        register name without the percent sign (rotated in place)
#   BLK_SIZE   256, 512 or 1024
#   ROUND_NUM  round number, already reduced mod 8 by the caller
#   MIX_NUM    index of the MIX operation within the round
# The amount is the symbol RC_<BLK_SIZE>_<ROUND_NUM>_<MIX_NUM>; an amount of
# zero emits no instruction at all.
#
.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  .ifne RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  .endif
.endm
250 #
251 #----------------------------------------------------------------
252 #
253 # MACROS: define local vars and configure stack
254 #
255 #----------------------------------------------------------------
# StackVar: reserve localSize bytes in the frame being laid out.
#   localName  symbol that receives the current running offset
#   localSize  number of bytes to reserve
# The running offset lives in _STK_OFFS_, which the caller must initialise.
.macro StackVar localName,localSize
    .set    \localName, _STK_OFFS_
    .set    _STK_OFFS_, _STK_OFFS_+(\localSize)
.endm #StackVar
261 #
262 #----------------------------------------------------------------
263 #
# MACRO: Configure stack frame, allocate local vars
#
#   BLK_BITS  block size in bits (256, 512 or 1024); WCNT is set to BLK_BITS/64
#   KS_CNT    number of 16-byte "rotation" slots reserved after the key
#             schedule when the rounds for this block size are looped
#   debugCnt  optional; number of 8-byte words of debug scratch space
#             (allocated only when _SKEIN_DEBUG is set)
#
#   Emitted code: pushes rbp, rbx, r12-r15, subtracts LOCAL_SIZE from rsp,
#   sets rbp = rsp + FRAME_OFFS and stores rdi, rsi, rdx, rcx into
#   ctxPtr, blkPtr, blkCnt, bitAdd.
#   Locals are then addressed as name(%rsp) or as name+F_O(%rbp).
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT    =    (\BLK_BITS)/64             #number of 64-bit words per block
#
_PushCnt_   =   0                           #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
    pushq   %\_reg_
_PushCnt_ = _PushCnt_ + 1                   #track count to keep alignment
  .endr
#
_STK_OFFS_  =   0                           #starting offset from rsp
    #---- local  variables                  #<-- rsp
    StackVar    X_stk  ,8*(WCNT)            #local context vars
    StackVar    ksTwk  ,8*3                 #key schedule: tweak words
    StackVar    ksKey  ,8*(WCNT)+8          #key schedule: key   words
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
    StackVar    ksRot ,16*(\KS_CNT)         #leave space for "rotation" to happen
  .endif
    StackVar    Wcopy  ,8*(WCNT)            #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                         #temp location for debug X[] info
    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
  # every size reserved above is a multiple of 8, so this test holds and the
  # pad word is always present; rIdx_offs for Skein-1024 is an alias of tmpStk_1024
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar    align16,8                   #keep 16-byte aligned (adjust for retAddr?)
tmpStk_\BLK_BITS = align16                  #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar    ctxPtr ,8                   #context ptr
    StackVar    blkPtr ,8                   #pointer to block data
    StackVar    blkCnt ,8                   #number of full blocks to process
    StackVar    bitAdd ,8                   #bit count to add to tweak
LOCAL_SIZE  =   _STK_OFFS_                  #size of "local" vars
    #----
    StackVar    savRegs,8*_PushCnt_         #saved registers
    StackVar    retAddr,8                   #return address
    #---- caller's stack frame (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
FRAME_OFFS  =   ksTwk + 128                 #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_               #keep rbp in the "locals" range
FRAME_OFFS  =      _STK_OFFS_
  .endif
F_O         =   -FRAME_OFFS                 #add F_O to a local name to address it relative to rbp
#
    #put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp                #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp           #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)          #save caller's parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
335 #
336 #----------------------------------------------------------------
337 #
# Reset_Stack: undo Setup_Stack.  Releases LOCAL_SIZE bytes of locals, then
# pops the six saved registers in the reverse of the order they were pushed.
# _PushCnt_ is counted back down; a non-zero remainder is an assembly error.
# The local area is released without being cleared.
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp                #drop the locals
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                         #restore one saved register
    .set    _PushCnt_, _PushCnt_ - 1
  .endr
  .ifne _PushCnt_
    .error  "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
348 #
349 #----------------------------------------------------------------
350 # macros to help debug internals
351 #
352 .if _SKEIN_DEBUG
.extern  Skein_Show_Block                   #calls to C routines
.extern  Skein_Show_Round
#
# Round "numbers" at or above SKEIN_RND_SPECIAL are not real round indices:
# Skein_Debug_Round passes them through as constants to mark special events.
SKEIN_RND_SPECIAL       =   1000
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
360 #
# Skein_Debug_Block: show the state at the start of a block by calling the
# external C routine Skein_Show_Block.
#   BLK_BITS  block size in bits; passed as the first C argument and named
#             in the assembly-time diagnostics below
# rax, rcx, rdx, rsi, rdi and r8-r11 are saved before the call and restored
# after it.  Requires rbp to be the frame pointer set up by Setup_Stack.
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                     const u08b_t *blkPtr, const u64b_t *wPtr,
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
#
_NN_ = 0                                    #counts the register pushes below
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                         #save all volatile regs on stack before the call
_NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS      ,%rdi           #bits
    movq    ctxPtr+F_O(%rbp),%rsi           #h (pointer)
    leaq    X_VARS   (%rsi),%rdx            #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx           #blkPtr
    leaq    Wcopy +F_O(%rbp),%r8            #wPtr
    leaq    ksKey +F_O(%rbp),%r9            #key pointer
    leaq    ksTwk +F_O(%rbp),%rax           #tweak pointer
    pushq   %rax                            #   (pass on the stack)
    call    Skein_Show_Block                #call external debug handler
    addq    $8*1,%rsp                       #discard parameters on stack
  # the register pushes plus the one stacked argument must add up to an even
  # number of 8-byte words, so an even register count is rejected
  .if (_NN_ % 2 ) == 0                      #check stack alignment
    .error  "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                         #restore regs
_NN_ = _NN_ - 1
  .endr
  .if _NN_
    .error  "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  .endif
.endm # Skein_Debug_Block
394 #
395 # the macro to "call" to debug a round
396 #
# Skein_Debug_Round (debug build): call the local helper
# Skein_Debug_Round_<BLK_BITS> with the round "number" in rdx.
#   BLK_BITS  block size in bits; selects the helper that is called
#   R         round number, or one of the SKEIN_RND_* special codes
#   RDI_OFFS  optional constant added to the computed round number in the
#             looping (not fully unrolled) case
#   afterOp   optional text emitted after the call
# rdx is saved and restored around the call.
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                            #save rdx, so we can use it for round "number"
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx                        #round number is an assembly-time constant
  .else                                     #compute round number using rdi
_rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx          #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx   #rdx = 4*rdi + position within the group of four + offset
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                            #restore original rdx value
#
    afterOp                                 #NOTE(review): written without a backslash; presumably relies on .altmacro substituting bare parameter names - confirm
.endm  #  Skein_Debug_Round
416 .else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
# Non-debug build: same macro names and parameter lists, but nothing is emitted,
# so the call sites need no conditionals of their own.
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
422 #
423 .endif # _SKEIN_DEBUG
424 #
425 #----------------------------------------------------------------
426 #
# addReg: dstReg += source register (+ optional constant), 64-bit.
#   dstReg     destination register name, without the percent sign
#   srcReg_A   source register name, or its leading part
#   srcReg_B   optional trailing part; the two parts are pasted together, so
#              "r" and a computed number form names such as r9
#   useAddOp   optional; non-zero forces addq in place of leaq
#   immOffs    optional; when non-zero it is added as well, through the
#              displacement of a single leaq
# The leaq forms leave the flags untouched; the addq forms do not.
# Defining ASM_NO_LEA selects addq for the plain two-register case.
.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
    leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .else
    .if ((\useAddOp + 0) == 0)
      .ifndef ASM_NO_LEA                    #lea seems to be faster on Core 2 Duo CPUs!
    leaq    (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
      .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
      .endif
    .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .endif
.endm
440
# xorReg: dstReg ^= source register, 64-bit.
#   dstReg              destination register name, without the percent sign
#   srcReg_A, srcReg_B  source register name, pasted together as in addReg
# keep Intel-style ordering here, to match addReg
.macro xorReg dstReg,srcReg_A,srcReg_B
    xorq    %\srcReg_A\srcReg_B,%\dstReg
.endm
445 #
446 #----------------------------------------------------------------
447 #
# C_label: define an entry point callable from C at the current location.
#   lName  the C-level name
# The label is exported twice, plain and with a leading underscore, so the
# object links under either symbol-naming convention.
.macro C_label lName
    .globl  \lName
    .globl  _\lName
\lName:
_\lName:
.endm
454 #
455 #=================================== Skein_256 =============================================
456 #
457 .if _USE_ASM_ & 256
458 #
459 # void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
460 #
461 #################
462 #
463 # code
464 #
# Skein_256_Process_Block: run the Skein-256 block function over blkCnt
# consecutive 32-byte input blocks, updating ctx->X[] and the tweak in place.
#   rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (see the C prototype above)
# The block loop is entered unconditionally and ends when the decremented
# count reaches zero, so blkCnt must be at least 1.
# T[0] is written back to the context for every block; T[1] is carried in r14
# and written back once, after the last block (earlier only in debug builds).
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14               #r14 = ctx->h.T[1]
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX        = X0..X3
    #   R08..R12        = ks[0..4]
    #   R13..R15        = ts[0..2]
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK+0(%rdi)     ,%r13
    addq    bitAdd+F_O(%rbp)  ,%r13          #computed updated tweak value T0
    movq    %r14              ,%r15
    xorq    %r13              ,%r15          #now %r13..%r15 is set as the tweak (r15 = T0 ^ T1)

    movq    $KW_PARITY        ,%r12
    movq       X_VARS+ 0(%rdi),%r8
    movq       X_VARS+ 8(%rdi),%r9
    movq       X_VARS+16(%rdi),%r10
    movq       X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)               #save updated tweak value ctx->h.T[0]
    xorq    %r8               ,%r12          #start accumulating overall parity

    movq    blkPtr +F_O(%rbp) ,%rsi          #rsi --> input block
    xorq    %r9               ,%r12
    movq     0(%rsi)          ,%rax          #get X[0..3]
    xorq    %r10              ,%r12
    movq     8(%rsi)          ,%rbx
    xorq    %r11              ,%r12          #r12 = KW_PARITY ^ ks[0] ^ ks[1] ^ ks[2] ^ ks[3]
    movq    16(%rsi)          ,%rcx
    movq    24(%rsi)          ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp)          #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax                        #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx                        #X1 += ts[0]
    addq    %r14,%rcx                        #X2 += ts[1]

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)              #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp)          #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)             #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256                    #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
    # Looping code reads the schedule from the stack, starting at word 1.
    # ks[0] and ts[0] are therefore stored one full period further up
    # (key slot 5, tweak slot 3), where the loop will reach them later.
    movq    %r8 ,ksKey+40+F_O(%rbp)          #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi                     #skip the block
    movq    %rsi,blkPtr  +F_O(%rbp)          #update block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 256
_UNROLL_CNT =   ROUNDS_256/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi                        #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2                          #each repetition is four rounds plus one key injection
    # all X and ks vars in regs              # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_RBase_ + 0
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8        #key word for X0 at the next injection
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg  rdx, rcx
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rdi          #precompute key injection value for %rcx
    .endr
    .endr
  .endif
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13       #tweak word for X1 at the next injection
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg  rdx, rax
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9        #key word for X1
                .endif
    addReg  rcx, rbx
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg  rbx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11       #key word for X3
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
 .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rsi          #precompute key injection value for %rbx
    .endr
    .endr
 .endif
    # round 4*_Rbase_ + 2
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10       #key word for X2
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg  rdx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)        #"rotate" the key
                    leaq 1(%r11,%rdi),%r11                     #precompute key + tweak
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg  rcx, rbx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
                .endif
    xorReg  rdx, rax
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg  rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addReg r9 ,r13                             #precompute key+tweak
                .endif
      #inject key schedule words
_Rbase_ = _Rbase_+1
  .if (SKEIN_ASM_UNROLL) & 256
    addReg    rax,r,%(8+((_Rbase_+0) % 5))
    addReg    rbx,rsi
    addReg    rcx,rdi
    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_            #key word plus the injection number
  .else
    incq      %rdi
    addReg    rax,r8
    addReg    rcx,r10
    addReg    rbx,r9
    addReg    rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi           #one injection per four rounds
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context

    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy + 0+F_O (%rbp),%rax
    xorq    Wcopy + 8+F_O (%rbp),%rbx
    xorq    Wcopy +16+F_O (%rbp),%rcx
    xorq    Wcopy +24+F_O (%rbp),%rdx
    andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared, used for the next block
    movq    %rax,X_VARS+ 0(%rdi)             #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK   + 8(%rdi)           #write the masked T[1] back once, after the last block
    Reset_Stack
    ret
Skein_256_Process_Block_End:
683
  .if _SKEIN_DEBUG
# Skein_Debug_Round_256: local helper reached by "call" from the
# Skein_Debug_Round macro, which has already pushed the original rdx.
# Copies X0..X3 into X_stk, loads the first two C arguments and jumps to
# Skein_Debug_Round_Common (defined outside the visible part of the file).
Skein_Debug_Round_256:                       #here with rdx == round "number" from macro
    pushq   %rsi                             #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    movq    24(%rsp),%rdi                    #get back original rdx (pushed on stack in macro call) to rdi
    movq    %rax,X_stk+ 0+F_O(%rbp)          #save X[] state on stack so debug routines can access it
    movq    %rbx,X_stk+ 8+F_O(%rbp)          #(use rbp-relative addresses since rsp has changed!)
    movq    %rcx,X_stk+16+F_O(%rbp)
    movq    %rdi,X_stk+24+F_O(%rbp)          #X3 is the rdx value fetched above

    movq    ctxPtr+F_O(%rbp),%rsi            #ctx_hdr_ptr
    movq    $256,%rdi                        #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
698 #
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is set.
.ifne _SKEIN_CODE_SIZE
# Returns in rax the byte length of Skein_256_Process_Block
# (distance from its entry label to its end label).
C_label Skein_256_Process_Block_CodeSize
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
    ret
#
# Returns in rax the unroll count used for the Skein-256 rounds,
# or 0 when the rounds are fully unrolled.
C_label Skein_256_Unroll_Cnt
  .if _UNROLL_CNT == (ROUNDS_256/8)
    xorq    %rax,%rax                        #fully unrolled --> report 0
  .else
    movq    $_UNROLL_CNT,%rax
  .endif
    ret
.endif
712 #
713 .endif #_USE_ASM_ & 256
714 #
715 #=================================== Skein_512 =============================================
716 #
717 .if _USE_ASM_ & 512
718 #
719 # void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
720 #
721 # X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7)
722 #
723 #################
724 # MACRO: one round for 512-bit blocks
725 #
# R_512_OneRound: one Skein-512 round, i.e. four MIX operations.
#   rn0..rn7  register numbers (8..15) of the four MIX pairs, in order:
#             (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7).  In each pair the
#             first register receives the sum; the second is rotated and
#             then XORed with that sum.
#   _Rn_      round number; taken mod 8 to select the rotation constants
#   op1..op4  optional instruction text emitted after MIX 0..3 respectively,
#             used by the caller to interleave key-schedule work
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
    addReg      r\rn0, r\rn1
    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
    xorReg      r\rn1, r\rn0
            \op1
    addReg      r\rn2, r\rn3
    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
    xorReg      r\rn3, r\rn2
            \op2
    addReg      r\rn4, r\rn5
    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
    xorReg      r\rn5, r\rn4
            \op3
    addReg      r\rn6, r\rn7
    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
    xorReg      r\rn7, r\rn6
            \op4
    Skein_Debug_Round 512,%(\_Rn_+1),-4
#
.endm #R_512_OneRound
747 #
748 #################
# MACRO: four rounds for 512-bit blocks, followed by one key injection
#
#   _RR_  base round number (a multiple of 4)
#   X0..X7 are held in r8..r15.  rax, rbx, rcx, rdx and rsi are temporaries
#   that pick up key and tweak words while the rounds run.  In the looping
#   case rdi is the key injection counter.
#
.macro R_512_FourRounds _RR_                #RR = base round number (0 mod 4)
  .if ((SKEIN_ASM_UNROLL) & 512)
    # here for fully unrolled case.
    _II_ = ((\_RR_)/4) + 1                  #key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
    # inject the key schedule
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
    addReg   r11, rax
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
    addReg   r12, rbx
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
    addReg   r13, rcx                       #rcx = key word + tweak word
    addReg   r14, rdx                       #rdx = key word + tweak word
    addReg   r15, rsi,,,(_II_)              #key word plus the injection counter
  .else
    # here for looping case                 #"rotate" key/tweak schedule (move up on stack)
    incq    %rdi                            #bump key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
    # inject the key schedule
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
    addReg   r11, rax
    addReg   r12, rbx
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
    addReg   r13, rcx
    addReg   r14, rdx
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
    addReg   r15, rsi
    addReg   r15, rdi                       #inject the round number
  .endif

    #show the result of the key injection
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
.endm #R_512_FourRounds
790 #
791 #################
792 # instantiated code
793 #
# Skein_512_Process_Block: run the Skein-512 block function over blkCnt
# consecutive 64-byte input blocks, updating ctx->X[] and the tweak in place.
#   rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (see the C prototype above)
# The block loop is entered unconditionally and ends when the decremented
# count reaches zero, so blkCnt must be at least 1.
# T[0] is written back for every block; T[1] is carried in rbx between blocks
# and written back once, after the last block (earlier only in debug builds).
C_label Skein_512_Process_Block
    Setup_Stack 512,ROUNDS_512/8
    movq    TWEAK+ 8(%rdi),%rbx               #rbx = ctx->h.T[1]
    jmp     Skein_512_block_loop
    .p2align 4
    # main hash loop for Skein_512
Skein_512_block_loop:
    # general register usage:
    #   RAX..RDX        = temps for key schedule pre-loads
    #   R8 ..R15        = X0..X7
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK +  0(%rdi),%rax
    addq    bitAdd+F_O(%rbp),%rax             #computed updated tweak value T0
    movq    %rbx,%rcx
    xorq    %rax,%rcx                         #%rax/%rbx/%rcx = tweak schedule
    movq    %rax,TWEAK+ 0    (%rdi)           #save updated tweak value ctx->h.T[0]
    movq    %rax,ksTwk+ 0+F_O(%rbp)
    movq    $KW_PARITY,%rdx
    movq    blkPtr +F_O(%rbp),%rsi            #%rsi --> input block
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
    movq    %rcx,ksTwk+16+F_O(%rbp)
    .irp _Rn_,8,9,10,11,12,13,14,15
      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
      xorq  %r\_Rn_,%rdx                      #compute overall parity
      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
    .endr                                     #load state into %r8 ..%r15, compute parity
      movq  %rdx,ksKey+8*(8)+F_O(%rbp)        #save key schedule parity

    addReg   r13,rax                          #precompute key injection for tweak
    addReg   r14, rbx
.if _SKEIN_DEBUG
    movq    %rbx,TWEAK+ 8(%rdi)               #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
.endif
    movq     0(%rsi),%rax                     #load input block
    movq     8(%rsi),%rbx
    movq    16(%rsi),%rcx
    movq    24(%rsi),%rdx
    addReg   r8 , rax                         #do initial key injection
    addReg   r9 , rbx
    movq    %rax,Wcopy+ 0+F_O(%rbp)           #keep local copy for feedforward
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    addReg   r10, rcx
    addReg   r11, rdx
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    movq    32(%rsi),%rax                     #second half of the input block
    movq    40(%rsi),%rbx
    movq    48(%rsi),%rcx
    movq    56(%rsi),%rdx
    addReg   r12, rax
    addReg   r13, rbx
    addReg   r14, rcx
    addReg   r15, rdx
    movq    %rax,Wcopy+32+F_O(%rbp)
    movq    %rbx,Wcopy+40+F_O(%rbp)
    movq    %rcx,Wcopy+48+F_O(%rbp)
    movq    %rdx,Wcopy+56+F_O(%rbp)

.if _SKEIN_DEBUG
    .irp _Rn_,8,9,10,11,12,13,14,15           #save values on stack for debug output
      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
    .endr

    Skein_Debug_Block 512                     #debug dump
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
.endif
    addq    $8*WCNT,%rsi                      #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)             #update block pointer
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 512
_UNROLL_CNT =   ROUNDS_512/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    xorq    %rdi,%rdi                         #rdi = round counter
Skein_512_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2                           #each repetition is four rounds plus one key injection
      R_512_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 512) == 0
    cmpq    $2*(ROUNDS_512/8),%rdi            #one injection per four rounds
    jb      Skein_512_round_loop
    movq    ctxPtr +F_O(%rbp),%rdi            #restore rdi --> context
.endif
    # end of rounds
    #################
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    # (the T[1] mask load and AND are slotted in at the first and seventh word)
    .irp _Rn_,8,9,10,11,12,13,14,15
  .if (\_Rn_ == 8)
    movq    $FIRST_MASK64,%rbx
  .endif
      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_   #feedforward XOR
      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)      #and store result
  .if (\_Rn_ == 14)
    andq    TWEAK+ 8(%rdi),%rbx               #rbx = T[1] with bit 62 cleared, used for the next block
  .endif
    .endr
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_512_block_loop
    movq    %rbx,TWEAK + 8(%rdi)              #write the masked T[1] back once, after the last block

    Reset_Stack
    ret
Skein_512_Process_Block_End:
915 #
  .if _SKEIN_DEBUG
# Skein_Debug_Round_512: local helper reached by "call" from the
# Skein_Debug_Round macro.  Copies X0..X7 (r8..r15) into X_stk, loads the
# first two C arguments and jumps to Skein_Debug_Round_Common (defined
# outside the visible part of the file).
# call here with rdx = "round number"
Skein_Debug_Round_512:
    pushq   %rsi                              #save two regs for BLK_BITS-specific parms
    pushq   %rdi
  .irp _Rn_,8,9,10,11,12,13,14,15             #save X[] state on stack so debug routines can access it
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
  .endr
    movq    ctxPtr+F_O(%rbp),%rsi             #ctx_hdr_ptr
    movq    $512,%rdi                         #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
928 #
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is set.
.ifne _SKEIN_CODE_SIZE
# Returns in rax the byte length of Skein_512_Process_Block
# (distance from its entry label to its end label).
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# Returns in rax the unroll count used for the Skein-512 rounds,
# or 0 when the rounds are fully unrolled.
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT == (ROUNDS_512/8)
    xorq    %rax,%rax                         #fully unrolled --> report 0
  .else
    movq    $_UNROLL_CNT,%rax
  .endif
    ret
.endif
942 #
943 .endif # _USE_ASM_ & 512
944 #
945 #=================================== Skein1024 =============================================
946 .if _USE_ASM_ & 1024
947 #
948 # void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
949 #
950 #################
951 # use details of permutation to make register assignments
952 #
# o1K_<reg> is the index of the X[] word held in that register during the
# Skein-1024 rounds.  There is no entry for word 6: it trades places with
# word 4 in rcx, and whichever is idle waits in its X_stk slot.
o1K_rdi =  0                        #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4                        #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
rIdx_offs = tmpStk_1024             #stack slot for the key injection index; tmpStk_1024 is assigned by Setup_Stack 1024
970 #
# r1024_Mix: one Skein-1024 MIX on a register pair; in every fourth round
# the key injection for both words is folded in as well.
#   w0, w1      indices in X[] of the two words
#   reg0, reg1  registers holding them; reg0 receives the sum, reg1 is
#               rotated and then XORed with that sum
#   _RN0_       round number (mod 8 selects the rotation constants;
#               a value of 3 mod 4 triggers the key injection)
#   _Rn1_       index of this MIX within the round, for the rotation constant
#   op1         optional instruction text emitted at the very end
# All stack operands here are rsp-relative, since rbp holds X2 during the
# Skein-1024 rounds.
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg      \reg0 , \reg1                      #perform the MIX
    RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
    xorReg      \reg1 , \reg0
.if ((\_RN0_) & 3) == 3                            #time to do key injection?
 .if _SKEIN_DEBUG
    movq       %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
    movq       %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
 .endif
_II_ = ((\_RN0_)/4)+1                              #injection count
 .if (SKEIN_ASM_UNROLL) & 1024                     #here to do fully unrolled key injection
    addq       ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
    addq       ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
  .if     \w1 == 13                                #tweak injection
    addq       ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
  .elseif \w0 == 14
    addq       ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
  .elseif \w1 == 15
    addq       $_II_, %\reg1                       #(injection counter)
  .endif
 .else                                             #here to do looping  key injection
  .if  (\w0 == 0)
    movq       %rdi, X_stk+8*\w0(%rsp)             #if so, store N0 so we can use reg as index
    movq       rIdx_offs(%rsp),%rdi                #get the injection counter index into rdi
  .else
    addq       ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0   #even key injection
  .endif
  .if     \w1 == 13                                #tweak injection
    addq       ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
  .elseif \w0 == 14
    addq       ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
  .elseif \w1 == 15
    addReg     \reg1,rdi,,,1                       #(injection counter)
  .endif
    addq       ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1   #odd key injection
 .endif
.endif
    # insert the op provided, .if any
    \op1
.endm
1011 #################
1012 # MACRO: four rounds for 1024-bit blocks
1013 #
# r1024_FourRounds: four Skein-1024 rounds (eight MIX operations each).
# The key injection is performed inside r1024_Mix during the fourth round;
# in the looping case this macro then rotates the key and tweak schedule on
# the stack and completes the injection for X0.
#   _RR_  base round number (a multiple of 4)
# Word-to-register assignments follow the o1K_* table; X4 and X6 take turns
# in rcx, with the idle one parked in X_stk.
.macro r1024_FourRounds _RR_                       #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (\_RR_) + 0
    r1024_Mix  0, 1,rdi,rsi,_Rn_,0
    r1024_Mix  2, 3,rbp,rax,_Rn_,1
    r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>    #save X4  on  stack (x4/x6 alternate)
    r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx>    #load X6 from stack
    r1024_Mix 10,11,r10,r11,_Rn_,5
    r1024_Mix 12,13,r12,r13,_Rn_,6
    r1024_Mix  6, 7,rcx,rdx,_Rn_,3
    r1024_Mix 14,15,r14,r15,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 1
    r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
    r1024_Mix  2,13,rbp,r13,_Rn_,1
    r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>    #save X6  on  stack (x4/x6 alternate)
    r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx>    #load X4 from stack
    r1024_Mix 12, 3,r12,rax,_Rn_,5
    r1024_Mix 14, 5,r14,rbx,_Rn_,6
    r1024_Mix  4,15,rcx,r15,_Rn_,3
    r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 2
    r1024_Mix  0, 7,rdi,rdx,_Rn_,0
    r1024_Mix  2, 5,rbp,rbx,_Rn_,1
    r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>    #save X4  on  stack (x4/x6 alternate)
    r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx>    #load X6 from stack
    r1024_Mix 14,13,r14,r13,_Rn_,5
    r1024_Mix  8,11,r8 ,r11,_Rn_,6
    r1024_Mix  6, 1,rcx,rsi,_Rn_,3
    r1024_Mix 10, 9,r10,r9 ,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 3                                 #in this round r1024_Mix also injects the key
    r1024_Mix  0,15,rdi,r15,_Rn_,0
    r1024_Mix  2,11,rbp,r11,_Rn_,1
    r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>    #save X6  on  stack (x4/x6 alternate)
    r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx>    #load X4 from stack
    r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
    r1024_Mix 10, 3,r10,rax,_Rn_,6
    r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
    r1024_Mix 12, 7,r12,rdx,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif

  .if ((SKEIN_ASM_UNROLL) & 1024) == 0             #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)                 #free up a register (save it on the stack)
    movq          ksKey+8* 0(%rsp,%rdi,8),%r8      #get  key  word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)          #rotate key (must do key first or tweak clobbers it!)
    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8      #get tweak word
    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)          #rotate tweak (onto the stack)
    movq          X_stk+8*i8(%rsp)       ,%r8      #get the reg back
    incq    %rdi                                   #bump the index
    movq    %rdi, rIdx_offs (%rsp)                 #save rdi again
    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi     #get the key schedule word for X0 back
    addq          X_stk+8*i0(%rsp)       ,%rdi     #perform the X0 key injection
  .endif
    #show the result of the key injection
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
.endm #r1024_FourRounds
1083 #
1084 ################
1085 # code
1086 #
# Skein1024_Process_Block
#   Compression loop for 1024-bit blocks. For every block it:
#     - adds bitAdd to T0 and stores the tweak schedule (T0, T1, T0^T1),
#     - copies the chaining words into the key schedule and accumulates
#       their XOR (seeded with KW_PARITY) as the extra parity word,
#     - performs the initial key/tweak injection,
#     - runs the rounds via r1024_FourRounds,
#     - XORs the result with the saved message words (feedforward) and
#       writes it back into the context.
#   On entry rdi is used as the context pointer (TWEAK and X_VARS are
#   addressed relative to it). The block pointer, block count and bit
#   increment are read from the frame slots blkPtr, blkCnt and bitAdd;
#   presumably Setup_Stack stores the other arguments there and builds
#   the frame (its definition is outside this section - confirm).
#   The block loop decrements blkCnt and then tests for zero, so a block
#   count of zero on entry is not handled here.
1087 C_label Skein1024_Process_Block
1088 #
1089 Setup_Stack 1024,ROUNDS_1024/8,WCNT
1090 movq TWEAK+ 8(%rdi),%r9 #r9 = T1, carried in a register from block to block
1091 jmp Skein1024_block_loop #jump over the alignment padding
1092 # main hash loop for Skein1024
1093 .p2align 4
1094 Skein1024_block_loop:
1095 # general register usage:
1096 # RSP = stack pointer
1097 # RDI,RSI,RBP,RAX,RBX,RDX = X0,X1,X2,X3,X5,X7 (state words)
1098 # R8 ..R15 = X8..X15 (state words)
1099 # RCX = X4 or X6 (they alternate; the other one is kept in X_stk)
1100 #
# The mapping above applies during the rounds. While the schedule is
# being built below, rdi = context pointer, rsi = message pointer and
# rbp = base register for the F_O-relative frame slots.
1101 .if ((SKEIN_ASM_UNROLL) & 1024) == 0
1102 xorq %rax,%rax #init loop index on the stack
1103 movq %rax,rIdx_offs(%rsp)
1104 .endif
1105 movq TWEAK+ 0(%rdi),%r8
1106 addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
1107 movq %r9 ,%r10
1108 xorq %r8 ,%r10 #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1)
1109 movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
1110 movq %r8 ,ksTwk+ 0+F_O(%rbp)
1111 movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
1112 movq %r10,ksTwk+16+F_O(%rbp)
1113 .if _SKEIN_DEBUG
1114 movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
1115 .endif
1116 movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
1117 movq $KW_PARITY ,%rax #overall key schedule parity
1118
1119 # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
# Words 0,1,2,3,4,6 cannot be loaded into their round registers yet:
# rdi/rsi/rbp/rax are still in use as pointers and parity accumulator,
# and rcx serves as the message temp in the next loop. They are parked in
# X_stk; words 0..4 are reloaded once the schedule is complete.
1120 .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
1121 movq X_VARS+8*\_rN_(%rdi),%r14 #get state word
1122 movq 8*\_rN_(%rsi),%r15 #get msg word
1123 xorq %r14,%rax #update key schedule overall parity
1124 movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack
1125 movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy
1126 addq %r15,%r14 #do the initial key injection
1127 movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack
1128 .endr
1129 # now process the rest, using the "real" registers
1130 # (MUST do it in reverse order to inject tweaks r8/r9 first)
# r8/r9 still hold T0/T1 at this point, so the words that receive the
# tweak (offsets 13 and 14) have to be handled before r9 and r8 are
# themselves overwritten with state words.
1131 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
1132 _oo_ = o1K_\_rr_ #offset associated with the register
1133 movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
1134 movq 8*_oo_(%rsi),%rcx #get next input msg word
1135 movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
1136 xorq %\_rr_, %rax #accumulate key schedule parity
1137 movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
1138 addq %rcx,%\_rr_ #do the initial key injection
1139 .if _oo_ == 13 #do the initial tweak injection
1140 addReg \_rr_,r8 # (only in words 13/14)
1141 .elseif _oo_ == 14
1142 addReg \_rr_,r9
1143 .endif
1144 .endr
1145 movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
1146 .if _SKEIN_DEBUG
1147 Skein_Debug_Block 1024 #initial debug dump
1148 .endif
1149 addq $8*WCNT,%rsi #bump the msg ptr
1150 movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
1151 # re-load words 0..4 from stack, enter the main loop
1152 .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
1153 movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
1154 .endr
1155 .if _SKEIN_DEBUG
1156 Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
1157 .endif
1158 #
1159 #################
1160 # now the key schedule is computed. Start the rounds
1161 #
# _UNROLL_CNT counts groups of eight rounds emitted inline. Fully unrolled
# builds emit all ROUNDS_1024/8 groups; looping builds emit
# SKEIN_UNROLL_1024 groups per pass, which must divide ROUNDS_1024/8.
1162 .if (SKEIN_ASM_UNROLL) & 1024
1163 _UNROLL_CNT = ROUNDS_1024/8
1164 .else
1165 _UNROLL_CNT = SKEIN_UNROLL_1024
1166 .if ((ROUNDS_1024/8) % _UNROLL_CNT)
1167 .error "Invalid SKEIN_UNROLL_1024"
1168 .endif
1169 Skein1024_round_loop:
1170 .endif
1171 #
1172 _Rbase_ = 0
1173 .rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
1174 r1024_FourRounds %(4*_Rbase_+00)
1175 _Rbase_ = _Rbase_+1
1176 .endr #rept _UNROLL_CNT*2
1177 #
# Looping build: compare the counter kept in the frame against
# 2*(ROUNDS_1024/8), the number of four-round groups in a block (the
# counter is bumped once per r1024_FourRounds expansion).
# NOTE(review): the slot is addressed as tmpStk_1024 here but as rIdx_offs
# where it is initialised and updated; presumably both name the same
# offset - confirm.
1178 .if ((SKEIN_ASM_UNROLL) & 1024) == 0
1179 cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done
1180 jb Skein1024_round_loop
1181 .endif
1182 # end of rounds
1183 #################
1184 #
1185 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
1186 movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
1187 movq ctxPtr(%rsp),%rdx #rdx = context pointer from here to the end of the block
1188
1189 .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
1190 _oo_ = o1K_\_rr_
1191 xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
1192 movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
# r9 is free once word 9 has been stored, so it is reused to build the
# T1 value for the next block: T1 & FIRST_MASK64.
1193 .if (_oo_ == 9)
1194 movq $FIRST_MASK64 ,%r9
1195 .endif
1196 .if (_oo_ == 14)
1197 andq TWEAK+ 8(%rdx),%r9
1198 .endif
1199 .endr
1200 #
1201 movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
1202 movq X_stk +8*7(%rsp),%rbx
1203 xorq Wcopy +8*6(%rsp),%rax
1204 xorq Wcopy +8*7(%rsp),%rbx
1205 movq %rax,X_VARS+8*6(%rdx)
1206 decq blkCnt(%rsp) #set zero flag iff done
1207 movq %rbx,X_VARS+8*7(%rdx)
1208
# The cmpq handed to the debug macro re-creates the zero flag from blkCnt
# for the jnz below.
1209 Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
1210 # go back for more blocks, if needed
1211 movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
1212 lea FRAME_OFFS(%rsp),%rbp #rbp = base for the F_O-relative slots (lea keeps the flags)
1213 jnz Skein1024_block_loop
1214 movq %r9 ,TWEAK+ 8(%rdx) #last block done: store the masked T1 back into the context
1215 Reset_Stack
1216 ret
1217 #
# End marker, used to compute the code size of this routine.
1218 Skein1024_Process_Block_End:
1219 #
1220 .if _SKEIN_DEBUG
# Debug thunk for the 1024-bit rounds. It writes the register-resident
# state words into their X_stk slots so the whole state can be read from
# memory, then continues in Skein_Debug_Round_Common with
# rdi = 1024 and rsi = context pointer.
# Stack on entry: 0(%rsp) = return address, 8(%rsp) = caller's rdx.
1221 Skein_Debug_Round_1024:
1222 # call here with rdx = "round number",
1223 _SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
1224 #
1225 #save rest of X[] state on stack so debug routines can access it
# (rdi, rcx and rdx are not in this list; they are handled separately below)
1226 .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
1227 movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
1228 .endr
1229 # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
1230 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
1231 jae save_x0
1232 testq $3,%rdx #otherwise only if rdx != 0 mod 4
1233 jz save_x0_not
1234 save_x0:
1235 movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
1236 save_x0_not:
1237 #figure out the x4/x6 swapping state and save the correct one!
1238 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
1239 jae save_x4
1240 testq $1,%rdx #and even round numbers have x4 in rcx as well
1241 jz save_x4
1242 movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) #odd round number: rcx holds x6
1243 jmp debug_1024_go
1244 save_x4:
1245 movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
1246 debug_1024_go:
1247 #now all is saved in Xstk[] except for rdx
1248 push %rsi #save two regs for BLK_BITS-specific parms
1249 push %rdi
1250 _SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
1251
1252 movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
1253 movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
1254
1255 movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
1256 movq $1024,%rdi #rdi = block size
1257 jmp Skein_Debug_Round_Common #tail jump; the common code returns to our caller
1258 .endif
1259 #
1260 .if _SKEIN_CODE_SIZE
# Two small query routines, assembled only when _SKEIN_CODE_SIZE is set.
# Both return their result in rax.
#
# Skein1024_Process_Block_CodeSize: returns the distance in bytes between
# the Skein1024_Process_Block and Skein1024_Process_Block_End labels.
1261 C_label Skein1024_Process_Block_CodeSize
1262 movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
1263 ret
1264 #
# Skein1024_Unroll_Cnt: returns _UNROLL_CNT, or 0 when _UNROLL_CNT equals
# ROUNDS_1024/8 (all rounds emitted inline).
1265 C_label Skein1024_Unroll_Cnt
1266 .if _UNROLL_CNT <> (ROUNDS_1024/8)
1267 movq $_UNROLL_CNT,%rax
1268 .else
1269 xorq %rax,%rax #0 --> fully unrolled
1270 .endif
1271 ret
1272 .endif
1273 #
1274 .endif # _USE_ASM_ and 1024
1275 #
1276 .if _SKEIN_DEBUG
1277 #----------------------------------------------------------------
1278 #local debug routine to set up for calls to:
1279 # void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
1280 # [ rdi rsi rdx rcx]
1281 #
1282 # here with %rdx = round number
1283 # %rsi = ctx_hdr_ptr
1284 # %rdi = block size (256/512/1024)
1285 # on stack: saved rdi, saved rsi, retAddr, saved rdx
1286 #
# The stack list above is in ascending address order starting at 0(%rsp).
# On return rdi, rsi and the twelve registers pushed below are restored.
# rdx is not restored here and its saved copy is left on the stack;
# presumably the invoking macro pops it after the return - confirm.
1287 Skein_Debug_Round_Common:
1288 _SP_OFFS_ = 32 #account for four words on stack already
1289 .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
1290 pushq %\_rr_
1291 _SP_OFFS_ = _SP_OFFS_+8
1292 .endr
# Assembly-time check that the bytes added to the stack since the hashing
# frame are a multiple of 16 (this assumes the frame itself is 16-byte
# aligned - confirm against Setup_Stack).
1293 .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
1294 .error "Debug_Round_Common: stack alignment"
1295 .endif
1296 # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
1297 leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
1298 cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
1299 jnz _got_rcxA
1300 leaq X_VARS(%rsi),%rcx #feedforward: show the words already stored in the context
1301 _got_rcxA:
1302 .if _USE_ASM_ & 1024
1303 # special handling for 1024-bit case
1304 # (for the rounds right before a key injection, i.e. non-zero
1305 # round numbers that are 0 mod 4: use xDebug_1024[] instead of X_stk[])
1306 cmpq $SKEIN_RND_SPECIAL,%rdx
1307 jae _got_rcxB #special (non-numbered) round: keep rcx
1308 orq %rdx,%rdx
1309 jz _got_rcxB #round number 0: keep rcx
1310 test $3,%rdx
1311 jne _got_rcxB #not a multiple of 4: keep rcx
1312 cmp $1024,%rdi #only the 1024-bit case for now
1313 jne _got_rcxB
1314 leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
1315 _got_rcxB:
1316 .endif
1317 call Skein_Show_Round #call external debug handler
1318
1319 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
1320 popq %\_rr_
1321 _SP_OFFS_ = _SP_OFFS_-8
1322 .endr
# Assembly-time check that the pops above balance the pushes.
1323 .if _SP_OFFS_ - 32
1324 .error "Debug_Round_Common: push/pop misalignment!"
1325 .endif
1326 popq %rdi #undo the two pushes made by the size-specific thunk
1327 popq %rsi
1328 ret
1329 .endif
1330 #----------------------------------------------------------------
# An empty .note.GNU-stack section tells the GNU toolchain that this
# object does not need an executable stack.
1331 .section .note.GNU-stack,"",@progbits
1332
1333 .end #end of assembly; the assembler does not process anything after this
Cache object: b8b2979f33123748464d971888f03395
|