| #!/usr/bin/env perl |
| # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # Keccak-1600 for AVX-512F. |
| # |
| # July 2017. |
| # |
# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# It's pretty straightforward, the only "magic" being the data layout in
# registers. No single layout is optimal for every step, hence it changes
# as the algorithm progresses. Data is saved in linear order, but the
# in-register order morphs between rounds: even rounds take their input in
# linear layout, odd rounds in transposed, or "vertically-shaped", layout...
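#
# For reference, one round in the notation of sha/keccak1600.c is roughly
# (all indices mod 5):
#
#	C[x]     = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];	# Theta
#	D[x]     = C[x-1] ^ ROL64(C[x+1], 1);
#	A[y][x] ^= D[x];
#	A[y][x]  = ROL64(A[y][x], rhotates[y][x]);			# Rho
#	A        = <lane permutation of A>;				# Pi
#	A[y][x] ^= ~A[y][x+1] & A[y][x+2];				# Chi
#	A[0][0] ^= iotas[round];					# Iota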
| # |
| ######################################################################## |
# Numbers are cycles per processed byte of a large message.
| # |
| # r=1088(*) |
| # |
| # Knights Landing 7.6 |
| # Skylake-X 5.7 |
| # |
| # (*) Corresponds to SHA3-256. |
| |
| ######################################################################## |
# The code below is a combination of two ideas. One is taken from the
# Keccak Code Package, hereafter KCP, and the other from the initial
# version of this module. What they share is the observation that Pi's
# input and output are "mostly transposed", i.e. if the input is aligned
# by the x coordinate, then the output is [mostly] aligned by y. Both
# versions, KCP and the predecessor, tried to stick to one of the two
# layouts from round to round, which resulted in some kind of
# transposition in each round. This version still transposes the data,
# but only every second round. Another essential factor is that the KCP
# transposition has to be performed with instructions that turned out
# to be rather expensive on Knights Landing, both latency- and
# throughput-wise, not to mention that some of them depend on each
# other. The initial version of this module, on the other hand, relied
# heavily on blend instructions. There were lots of them, resulting in
# a higher instruction count, yet it performed better on Knights
# Landing, because the processor can execute a pair of them each cycle
# and they have minimal latency. This module is an attempt to bring the
# best parts together :-)
| # |
| # Coordinates below correspond to those in sha/keccak1600.c. Input |
| # layout is straight linear: |
| # |
| # [0][4] [0][3] [0][2] [0][1] [0][0] |
| # [1][4] [1][3] [1][2] [1][1] [1][0] |
| # [2][4] [2][3] [2][2] [2][1] [2][0] |
| # [3][4] [3][3] [3][2] [3][1] [3][0] |
| # [4][4] [4][3] [4][2] [4][1] [4][0] |
| # |
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield a layout perfect for Chi:
| # |
| # [4][0] [3][0] [2][0] [1][0] [0][0] |
| # [4][1] [3][1] [2][1] [1][1] [0][1] |
| # [4][2] [3][2] [2][2] [1][2] [0][2] |
| # [4][3] [3][3] [2][3] [1][3] [0][3] |
| # [4][4] [3][4] [2][4] [1][4] [0][4] |
| # |
# Now, instead of performing a full transposition and feeding it to the
# next identical round, we perform a kind of diagonal transposition to
# the layout from the initial version of this module, making it suitable
# for Theta:
| # |
| # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0] |
| # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0] |
| # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0] |
| # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0] |
| # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0] |
| # |
# Now intra-register permutations yield the initial [almost] straight
# linear layout:
| # |
| # [4][4] [3][3] [2][2] [1][1] [0][0] |
| ##[0][4] [0][3] [0][2] [0][1] [0][0] |
| # [3][4] [2][3] [1][2] [0][1] [4][0] |
| ##[2][3] [2][2] [2][1] [2][0] [2][4] |
| # [2][4] [1][3] [0][2] [4][1] [3][0] |
| ##[4][2] [4][1] [4][0] [4][4] [4][3] |
| # [1][4] [0][3] [4][2] [3][1] [2][0] |
| ##[1][1] [1][0] [1][4] [1][3] [1][2] |
| # [0][4] [4][3] [3][2] [2][1] [1][0] |
| ##[3][0] [3][4] [3][3] [3][2] [3][1] |
| # |
# This means that the odd-round Chi is performed in a less suitable
# layout, with a number of additional permutations. But overall it
# turned out to be a win. The permutations are the fastest possible on
# Knights Landing and they are laid out to be independent of each other.
# In essence I traded 20 blend instructions for 3 permutations. The
# result is 13% faster than KCP on Skylake-X, and >40% faster on
# Knights Landing.
| # |
# As implied, data is loaded in straight linear order. The digits in the
# variables' names give the coordinates of the right-most element of the
# loaded data chunk:
| |
| my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0] |
| $A10, # [1][4] [1][3] [1][2] [1][1] [1][0] |
| $A20, # [2][4] [2][3] [2][2] [2][1] [2][0] |
| $A30, # [3][4] [3][3] [3][2] [3][1] [3][0] |
| $A40) = # [4][4] [4][3] [4][2] [4][1] [4][0] |
| map("%zmm$_",(0..4)); |
| |
# We also need to map the magic order into offsets within the structure:
| |
| my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4], |
| [1,0], [1,1], [1,2], [1,3], [1,4], |
| [2,0], [2,1], [2,2], [2,3], [2,4], |
| [3,0], [3,1], [3,2], [3,3], [3,4], |
| [4,0], [4,1], [4,2], [4,3], [4,4]); |
| @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear |
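# For example, lane [3][2] of the flat state, which sits at byte offset
# 8*(5*3+2) = 136 in A_flat, is staged at offset 8*(8*3+2) = 208 of the
# transfer area, i.e. lane 2 of its fourth 64-byte row.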
| |
| my @T = map("%zmm$_",(5..12)); |
my @Theta     = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is not a typo, it's never used
| my @Pi0 = map("%zmm$_",(17..21)); |
| my @Rhotate0 = map("%zmm$_",(22..26)); |
| my @Rhotate1 = map("%zmm$_",(27..31)); |
| |
| my ($C00,$D00) = @T[0..1]; |
| my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6)); |
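# Opmask names encode their bit patterns: $k00001 (0x01) selects lane 0,
# the one Iota touches, $k00010..$k10000 select lanes 1-4 for the blends,
# and $k11111 (0x1f) covers the five 64-bit lanes holding one row of the
# state; the top three lanes of each zmm register are unused.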
| |
| $code.=<<___; |
| .text |
| |
| .type __KeccakF1600,\@function |
| .align 32 |
| __KeccakF1600: |
| lea iotas(%rip),%r10 |
| mov \$12,%eax |
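	# 12 iterations of the loop below cover two Keccak rounds each,
	# i.e. the full 24; %r10 advances by 16 bytes, two iota constants,
	# per iteration.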
| jmp .Loop_avx512 |
| |
| .align 32 |
| .Loop_avx512: |
| ######################################### Theta, even round |
| vmovdqa64 $A00,@T[0] # put aside original A00 |
| vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00" |
| vpternlogq \$0x96,$A40,$A30,$A00 |
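	# 0x96 is the truth table of 3-input XOR, so the two vpternlogq
	# above accumulate the Theta column parities
	# C[x] = A[0][x]^A[1][x]^A[2][x]^A[3][x]^A[4][x] in one register.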
| |
| vprolq \$1,$A00,$D00 |
| vpermq $A00,@Theta[1],$A00 |
| vpermq $D00,@Theta[4],$D00 |
| |
| vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00 |
| vpternlogq \$0x96,$A00,$D00,$A10 |
| vpternlogq \$0x96,$A00,$D00,$A20 |
| vpternlogq \$0x96,$A00,$D00,$A30 |
| vpternlogq \$0x96,$A00,$D00,$A40 |
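	# The two vpermq above put C[x-1] in lane x of A00 and ROL(C[x+1],1)
	# in lane x of D00, so the five 3-input XORs fold
	# D[x] = C[x-1] ^ ROL(C[x+1],1) into every row, completing Theta.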
| |
| ######################################### Rho |
| vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00 |
| vprolvq @Rhotate0[1],$A10,$A10 |
| vprolvq @Rhotate0[2],$A20,$A20 |
| vprolvq @Rhotate0[3],$A30,$A30 |
| vprolvq @Rhotate0[4],$A40,$A40 |
| |
| ######################################### Pi |
| vpermq $A00,@Pi0[0],$A00 |
| vpermq $A10,@Pi0[1],$A10 |
| vpermq $A20,@Pi0[2],$A20 |
| vpermq $A30,@Pi0[3],$A30 |
| vpermq $A40,@Pi0[4],$A40 |
| |
| ######################################### Chi |
| vmovdqa64 $A00,@T[0] |
| vmovdqa64 $A10,@T[1] |
| vpternlogq \$0xD2,$A20,$A10,$A00 |
| vpternlogq \$0xD2,$A30,$A20,$A10 |
| vpternlogq \$0xD2,$A40,$A30,$A20 |
| vpternlogq \$0xD2,@T[0],$A40,$A30 |
| vpternlogq \$0xD2,@T[1],@T[0],$A40 |
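	# 0xD2 is the truth table of dst ^ (~src1 & src2), i.e. Chi:
	# A[y][x] ^= ~A[y][x+1] & A[y][x+2]. The copies in T[0] and T[1]
	# preserve the original x=0 and x=1 registers still needed by the
	# last two ternlogs for the wrap-around.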
| |
| ######################################### Iota |
| vpxorq (%r10),$A00,${A00}{$k00001} |
| lea 16(%r10),%r10 |
| |
| ######################################### Harmonize rounds |
| vpblendmq $A20,$A10,@{T[1]}{$k00010} |
| vpblendmq $A30,$A20,@{T[2]}{$k00010} |
| vpblendmq $A40,$A30,@{T[3]}{$k00010} |
| vpblendmq $A10,$A00,@{T[0]}{$k00010} |
| vpblendmq $A00,$A40,@{T[4]}{$k00010} |
| |
| vpblendmq $A30,@T[1],@{T[1]}{$k00100} |
| vpblendmq $A40,@T[2],@{T[2]}{$k00100} |
| vpblendmq $A20,@T[0],@{T[0]}{$k00100} |
| vpblendmq $A00,@T[3],@{T[3]}{$k00100} |
| vpblendmq $A10,@T[4],@{T[4]}{$k00100} |
| |
| vpblendmq $A40,@T[1],@{T[1]}{$k01000} |
| vpblendmq $A30,@T[0],@{T[0]}{$k01000} |
| vpblendmq $A00,@T[2],@{T[2]}{$k01000} |
| vpblendmq $A10,@T[3],@{T[3]}{$k01000} |
| vpblendmq $A20,@T[4],@{T[4]}{$k01000} |
| |
| vpblendmq $A40,@T[0],@{T[0]}{$k10000} |
| vpblendmq $A00,@T[1],@{T[1]}{$k10000} |
| vpblendmq $A10,@T[2],@{T[2]}{$k10000} |
| vpblendmq $A20,@T[3],@{T[3]}{$k10000} |
| vpblendmq $A30,@T[4],@{T[4]}{$k10000} |
| |
| #vpermq @T[0],@Theta[0],$A00 # doesn't actually change order |
| vpermq @T[1],@Theta[1],$A10 |
| vpermq @T[2],@Theta[2],$A20 |
| vpermq @T[3],@Theta[3],$A30 |
| vpermq @T[4],@Theta[4],$A40 |
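	# The masked blends gathered the post-Chi diagonals into T[0..4],
	# and the vpermq above rotated each of them into place (T[0] needs
	# no rotation, hence the commented-out line), yielding the diagonal
	# layout pictured in the header for the odd round's Theta.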
| |
| ######################################### Theta, odd round |
| vmovdqa64 $T[0],$A00 # real A00 |
| vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias |
| vpternlogq \$0x96,$A40,$A30,$C00 |
| |
| vprolq \$1,$C00,$D00 |
| vpermq $C00,@Theta[1],$C00 |
| vpermq $D00,@Theta[4],$D00 |
| |
| vpternlogq \$0x96,$C00,$D00,$A00 |
| vpternlogq \$0x96,$C00,$D00,$A30 |
| vpternlogq \$0x96,$C00,$D00,$A10 |
| vpternlogq \$0x96,$C00,$D00,$A40 |
| vpternlogq \$0x96,$C00,$D00,$A20 |
| |
| ######################################### Rho |
| vprolvq @Rhotate1[0],$A00,$A00 |
| vprolvq @Rhotate1[3],$A30,@T[1] |
| vprolvq @Rhotate1[1],$A10,@T[2] |
| vprolvq @Rhotate1[4],$A40,@T[3] |
| vprolvq @Rhotate1[2],$A20,@T[4] |
| |
| vpermq $A00,@Theta[4],@T[5] |
| vpermq $A00,@Theta[3],@T[6] |
| |
| ######################################### Iota |
| vpxorq -8(%r10),$A00,${A00}{$k00001} |
| |
| ######################################### Pi |
| vpermq @T[1],@Theta[2],$A10 |
| vpermq @T[2],@Theta[4],$A20 |
| vpermq @T[3],@Theta[1],$A30 |
| vpermq @T[4],@Theta[3],$A40 |
| |
| ######################################### Chi |
| vpternlogq \$0xD2,@T[6],@T[5],$A00 |
| |
| vpermq @T[1],@Theta[1],@T[7] |
| #vpermq @T[1],@Theta[0],@T[1] |
| vpternlogq \$0xD2,@T[1],@T[7],$A10 |
| |
| vpermq @T[2],@Theta[3],@T[0] |
| vpermq @T[2],@Theta[2],@T[2] |
| vpternlogq \$0xD2,@T[2],@T[0],$A20 |
| |
| #vpermq @T[3],@Theta[0],@T[3] |
| vpermq @T[3],@Theta[4],@T[1] |
| vpternlogq \$0xD2,@T[1],@T[3],$A30 |
| |
| vpermq @T[4],@Theta[2],@T[0] |
| vpermq @T[4],@Theta[1],@T[4] |
| vpternlogq \$0xD2,@T[4],@T[0],$A40 |
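	# Chi in this odd round runs in the less convenient layout: the two
	# Chi inputs of each destination come from the vpermq copies prepared
	# above rather than from lane-aligned neighbour registers. This is
	# the extra permutation cost, traded for blends, noted in the header.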
| |
| dec %eax |
| jnz .Loop_avx512 |
| |
| ret |
| .size __KeccakF1600,.-__KeccakF1600 |
| ___ |
| |
| my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); |
| my $out = $inp; # in squeeze |
| |
| $code.=<<___; |
| .globl SHA3_absorb |
| .type SHA3_absorb,\@function |
| .align 32 |
| SHA3_absorb: |
	mov	%rsp,%r11		# preserve original stack pointer

	lea	-320(%rsp),%rsp		# 5x64-byte transfer area
	and	\$-64,%rsp		# 64-byte aligned for vmovdqa64
| |
| lea 96($A_flat),$A_flat |
| lea 96($inp),$inp |
| lea 128(%rsp),%r9 |
| |
| lea theta_perm(%rip),%r8 |
| |
| kxnorw $k11111,$k11111,$k11111 |
| kshiftrw \$15,$k11111,$k00001 |
| kshiftrw \$11,$k11111,$k11111 |
| kshiftlw \$1,$k00001,$k00010 |
| kshiftlw \$2,$k00001,$k00100 |
| kshiftlw \$3,$k00001,$k01000 |
| kshiftlw \$4,$k00001,$k10000 |
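	# kxnorw sets k11111 to 0xffff; the shifts leave 0x0001 in k00001,
	# 0x001f in k11111 and 0x0002/0x0004/0x0008/0x0010 in the remaining
	# masks, selecting individual 64-bit lanes 0-4.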
| |
| #vmovdqa64 64*0(%r8),@Theta[0] |
| vmovdqa64 64*1(%r8),@Theta[1] |
| vmovdqa64 64*2(%r8),@Theta[2] |
| vmovdqa64 64*3(%r8),@Theta[3] |
| vmovdqa64 64*4(%r8),@Theta[4] |
| |
| vmovdqa64 64*5(%r8),@Rhotate1[0] |
| vmovdqa64 64*6(%r8),@Rhotate1[1] |
| vmovdqa64 64*7(%r8),@Rhotate1[2] |
| vmovdqa64 64*8(%r8),@Rhotate1[3] |
| vmovdqa64 64*9(%r8),@Rhotate1[4] |
| |
| vmovdqa64 64*10(%r8),@Rhotate0[0] |
| vmovdqa64 64*11(%r8),@Rhotate0[1] |
| vmovdqa64 64*12(%r8),@Rhotate0[2] |
| vmovdqa64 64*13(%r8),@Rhotate0[3] |
| vmovdqa64 64*14(%r8),@Rhotate0[4] |
| |
| vmovdqa64 64*15(%r8),@Pi0[0] |
| vmovdqa64 64*16(%r8),@Pi0[1] |
| vmovdqa64 64*17(%r8),@Pi0[2] |
| vmovdqa64 64*18(%r8),@Pi0[3] |
| vmovdqa64 64*19(%r8),@Pi0[4] |
| |
| vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} |
| vpxorq @T[0],@T[0],@T[0] |
| vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} |
| vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} |
| vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} |
| vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} |
| |
| vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack |
| vmovdqa64 @T[0],1*64-128(%r9) |
| vmovdqa64 @T[0],2*64-128(%r9) |
| vmovdqa64 @T[0],3*64-128(%r9) |
| vmovdqa64 @T[0],4*64-128(%r9) |
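	# Lanes beyond the rate stay zero, so the five full-width vpxorq in
	# the loop below fold exactly one rate-sized block into the state.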
| jmp .Loop_absorb_avx512 |
| |
| .align 32 |
| .Loop_absorb_avx512: |
	mov	$bsz,%rax
	sub	$bsz,$len
	jc	.Ldone_absorb_avx512	# less than one full block left

	shr	\$3,%eax		# block size in 64-bit lanes
| ___ |
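# Copy the block into the transfer area one 64-bit lane at a time, each
# lane going to its jagged offset; %eax counts down the bsz/8 lanes of
# one block.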
| for(my $i=0; $i<25; $i++) { |
| $code.=<<___ |
| mov 8*$i-96($inp),%r8 |
| mov %r8,$A_jagged[$i]-128(%r9) |
| dec %eax |
	jz	.Labsorbed_avx512
| ___ |
| } |
| $code.=<<___; |
.Labsorbed_avx512:
| lea ($inp,$bsz),$inp |
| |
| vpxorq 64*0-128(%r9),$A00,$A00 |
| vpxorq 64*1-128(%r9),$A10,$A10 |
| vpxorq 64*2-128(%r9),$A20,$A20 |
| vpxorq 64*3-128(%r9),$A30,$A30 |
| vpxorq 64*4-128(%r9),$A40,$A40 |
| |
| call __KeccakF1600 |
| |
| jmp .Loop_absorb_avx512 |
| |
| .align 32 |
| .Ldone_absorb_avx512: |
| vmovdqu64 $A00,40*0-96($A_flat){$k11111} |
| vmovdqu64 $A10,40*1-96($A_flat){$k11111} |
| vmovdqu64 $A20,40*2-96($A_flat){$k11111} |
| vmovdqu64 $A30,40*3-96($A_flat){$k11111} |
| vmovdqu64 $A40,40*4-96($A_flat){$k11111} |
| |
| vzeroupper |
| |
| lea (%r11),%rsp |
	lea	($len,$bsz),%rax	# return value: bytes left unabsorbed (< bsz)
| ret |
| .size SHA3_absorb,.-SHA3_absorb |
| |
| .globl SHA3_squeeze |
| .type SHA3_squeeze,\@function |
| .align 32 |
| SHA3_squeeze: |
| mov %rsp,%r11 |
| |
| lea 96($A_flat),$A_flat |
| cmp $bsz,$len |
| jbe .Lno_output_extension_avx512 |
| |
| lea theta_perm(%rip),%r8 |
| |
| kxnorw $k11111,$k11111,$k11111 |
| kshiftrw \$15,$k11111,$k00001 |
| kshiftrw \$11,$k11111,$k11111 |
| kshiftlw \$1,$k00001,$k00010 |
| kshiftlw \$2,$k00001,$k00100 |
| kshiftlw \$3,$k00001,$k01000 |
| kshiftlw \$4,$k00001,$k10000 |
| |
| #vmovdqa64 64*0(%r8),@Theta[0] |
| vmovdqa64 64*1(%r8),@Theta[1] |
| vmovdqa64 64*2(%r8),@Theta[2] |
| vmovdqa64 64*3(%r8),@Theta[3] |
| vmovdqa64 64*4(%r8),@Theta[4] |
| |
| vmovdqa64 64*5(%r8),@Rhotate1[0] |
| vmovdqa64 64*6(%r8),@Rhotate1[1] |
| vmovdqa64 64*7(%r8),@Rhotate1[2] |
| vmovdqa64 64*8(%r8),@Rhotate1[3] |
| vmovdqa64 64*9(%r8),@Rhotate1[4] |
| |
| vmovdqa64 64*10(%r8),@Rhotate0[0] |
| vmovdqa64 64*11(%r8),@Rhotate0[1] |
| vmovdqa64 64*12(%r8),@Rhotate0[2] |
| vmovdqa64 64*13(%r8),@Rhotate0[3] |
| vmovdqa64 64*14(%r8),@Rhotate0[4] |
| |
| vmovdqa64 64*15(%r8),@Pi0[0] |
| vmovdqa64 64*16(%r8),@Pi0[1] |
| vmovdqa64 64*17(%r8),@Pi0[2] |
| vmovdqa64 64*18(%r8),@Pi0[3] |
| vmovdqa64 64*19(%r8),@Pi0[4] |
| |
| vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} |
| vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} |
| vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} |
| vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} |
| vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} |
| |
| .Lno_output_extension_avx512: |
	shr	\$3,$bsz		# block size in 64-bit lanes
	lea	-96($A_flat),%r9
	mov	$bsz,%rax		# lanes left in the current block
| jmp .Loop_squeeze_avx512 |
| |
| .align 32 |
| .Loop_squeeze_avx512: |
| cmp \$8,$len |
| jb .Ltail_squeeze_avx512 |
| |
| mov (%r9),%r8 |
| lea 8(%r9),%r9 |
| mov %r8,($out) |
| lea 8($out),$out |
| sub \$8,$len # len -= 8 |
| jz .Ldone_squeeze_avx512 |
| |
| sub \$1,%rax # bsz-- |
| jnz .Loop_squeeze_avx512 |
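	# The current block is exhausted but more output is wanted: run the
	# permutation again and flush the updated state back to A_flat.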
| |
| #vpermq @Theta[4],@Theta[4],@Theta[3] |
| #vpermq @Theta[3],@Theta[4],@Theta[2] |
| #vpermq @Theta[3],@Theta[3],@Theta[1] |
| |
| call __KeccakF1600 |
| |
| vmovdqu64 $A00,40*0-96($A_flat){$k11111} |
| vmovdqu64 $A10,40*1-96($A_flat){$k11111} |
| vmovdqu64 $A20,40*2-96($A_flat){$k11111} |
| vmovdqu64 $A30,40*3-96($A_flat){$k11111} |
| vmovdqu64 $A40,40*4-96($A_flat){$k11111} |
| |
| lea -96($A_flat),%r9 |
| mov $bsz,%rax |
| jmp .Loop_squeeze_avx512 |
| |
| .Ltail_squeeze_avx512: |
| mov $out,%rdi |
| mov %r9,%rsi |
| mov $len,%rcx |
| .byte 0xf3,0xa4 # rep movsb |
| |
| .Ldone_squeeze_avx512: |
| vzeroupper |
| |
| lea (%r11),%rsp |
| ret |
| .size SHA3_squeeze,.-SHA3_squeeze |
| |
| .align 64 |
| theta_perm: |
	.quad	0, 1, 2, 3, 4, 5, 6, 7	# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7	# dst[j] = src[j-1 mod 5]
	.quad	3, 4, 0, 1, 2, 5, 6, 7	# dst[j] = src[j-2 mod 5]
	.quad	2, 3, 4, 0, 1, 5, 6, 7	# dst[j] = src[j-3 mod 5]
	.quad	1, 2, 3, 4, 0, 5, 6, 7	# dst[j] = src[j-4 mod 5] = src[j+1 mod 5]
| |
| rhotates1: |
| .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] |
| .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] |
| .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] |
| .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] |
| .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] |
| |
| rhotates0: |
	.quad	0,  1, 62, 28, 27, 0, 0, 0	# [0][0] [0][1] [0][2] [0][3] [0][4]
	.quad	36, 44,  6, 55, 20, 0, 0, 0	# [1][0] [1][1] [1][2] [1][3] [1][4]
	.quad	3,  10, 43, 25, 39, 0, 0, 0	# [2][0] [2][1] [2][2] [2][3] [2][4]
	.quad	41, 45, 15, 21,  8, 0, 0, 0	# [3][0] [3][1] [3][2] [3][3] [3][4]
	.quad	18,  2, 61, 56, 14, 0, 0, 0	# [4][0] [4][1] [4][2] [4][3] [4][4]
| |
| pi0_perm: |
| .quad 0, 3, 1, 4, 2, 5, 6, 7 |
| .quad 1, 4, 2, 0, 3, 5, 6, 7 |
| .quad 2, 0, 3, 1, 4, 5, 6, 7 |
| .quad 3, 1, 4, 2, 0, 5, 6, 7 |
| .quad 4, 2, 0, 3, 1, 5, 6, 7 |
| |
| |
iotas:	# 24 Keccak-p[1600] round constants, two consumed per .Loop_avx512 iteration
| .quad 0x0000000000000001 |
| .quad 0x0000000000008082 |
| .quad 0x800000000000808a |
| .quad 0x8000000080008000 |
| .quad 0x000000000000808b |
| .quad 0x0000000080000001 |
| .quad 0x8000000080008081 |
| .quad 0x8000000000008009 |
| .quad 0x000000000000008a |
| .quad 0x0000000000000088 |
| .quad 0x0000000080008009 |
| .quad 0x000000008000000a |
| .quad 0x000000008000808b |
| .quad 0x800000000000008b |
| .quad 0x8000000000008089 |
| .quad 0x8000000000008003 |
| .quad 0x8000000000008002 |
| .quad 0x8000000000000080 |
| .quad 0x000000000000800a |
| .quad 0x800000008000000a |
| .quad 0x8000000080008081 |
| .quad 0x8000000000008080 |
| .quad 0x0000000080000001 |
| .quad 0x8000000080008008 |
| |
| .asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>" |
| ___ |
| |
| $output=pop and open STDOUT,">$output"; |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |