| #! /usr/bin/env perl |
| # Copyright (C) 2023 Intel Corporation |
| # |
| # Licensed under the OpenSSL license (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ) |
| # from Intel(R) Intelligent Storage Acceleration Library Crypto Version |
| # (https://github.com/intel/isa-l_crypto). |
| # |
| ###################################################################### |
# The main building block is code that encrypts/decrypts 8 or 16
# blocks of data, stitched together with the generation of the tweaks
# for the next 8/16 blocks, using the VAES and VPCLMULQDQ instructions
# at the full width of the ZMM registers. The main loop is selected
# based on the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel; selected
# when input length >= 256 bytes (16 blocks or more).
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel; selected
# when 128 bytes <= input length < 256 bytes (8-15 blocks).
# Input lengths below 128 bytes (fewer than 8 blocks) are handled by
# do_n_blocks.
#
# This implementation mainly uses vpshrdq from the AVX512-VBMI2 family
# plus the ZMM forms of vaesenc/vaesdec (VAES) and vpclmulqdq
# (VPCLMULQDQ).
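#
# A minimal Perl model of the scalar tweak update that the
# shl/adc/cmovc/xor sequences below implement: multiply the 128-bit
# tweak, kept as two 64-bit halves, by x in GF(2^128) with the
# reduction polynomial 0x87. Illustrative only (never called during
# code generation); assumes a 64-bit perl.
sub xts_tweak_double_model {
    my ($low, $high) = @_;
    my $carry = ($high >> 63) & 1;          # bit shifted out at the top
    $high = (($high << 1) | (($low >> 63) & 1)) & 0xffffffffffffffff; # adc
    $low  = ($low << 1) & 0xffffffffffffffff;                         # shl
    $low ^= 0x87 if $carry;                 # cmovc + xor with the polynomial
    return ($low, $high);
}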
| $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; |
| $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; |
| |
| $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| $avx512vaes=0; |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| die "can't locate x86_64-xlate.pl"; |
| |
| if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| =~ /GNU assembler version ([0-9]+)\.([0-9]+)/) { |
| my $ver = $1 + $2/100.0; # 3.1->3.01, 3.10->3.10 |
| $avx512vaes = ($ver >= 2.30); |
| } |
| |
| if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| `nasm -v 2>&1` =~ /NASM version ([0-9]+)\.([0-9]+)(?:\.([0-9]+))?/) { |
| my $ver = $1 + $2/100.0 + $3/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 |
| $avx512vaes = ($ver >= 2.1108); |
| } |
| |
| if (!$avx512vaes && `$ENV{CC} -v 2>&1` |
| =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { |
| my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 |
| if ($1) { |
	# Apple clang uses a different version numbering scheme, see
	# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
	# clang 7.0.0 corresponds to Apple clang 10.0.1
| $avx512vaes = ($ver>=10.0001) |
| } else { |
| $avx512vaes = ($ver>=7.0); |
| } |
| } |
| |
| open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" |
| or die "can't call $xlate: $!"; |
| *STDOUT=*OUT; |
| |
| #====================================================================== |
| |
| if ($avx512vaes) { |
| |
my $GP_STORAGE = $win64 ? (16 * 18) : (16 * 8);	# offset of the GP save area (rbx; plus rdi/rsi on win64)
my $XMM_STORAGE = $win64 ? (16 * 8) : 0;	# offset of the xmm6:xmm15 save area (win64 only)
my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) :
                               (16*8 + 8*1);	# total frame size

# The first 0x80 (128) bytes of the frame hold the eight tweak values;
# offsets at or above 0x80 hold the saved registers. All accesses to
# the frame should go through $TW; do not shadow it with any other
# name or use %rsp directly.
| my $TW = "%rsp"; |
| my $TEMPHIGH = "%rbx"; |
| my $TEMPLOW = "%rax"; |
| my $ZPOLY = "%zmm25"; |
| |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # ;;; Function arguments abstraction |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| my ($key2, $key1, $tweak, $length, $input, $output); |
| |
| |
| $input = "%rdi"; |
| $output = "%rsi"; |
| $length = "%rdx"; |
| $key1 = "%rcx"; |
| $key2 = "%r8"; |
| $tweak = "%r9"; |
| |
# temporary registers ($tmp1 aliases $key2/%r8, which is only needed
# while encrypting the initial tweak)
| my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp); |
| $tmp1 = "%r8"; |
| $gf_poly_8b = "%r10"; |
| $gf_poly_8b_temp = "%r11"; |
| |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # ;;; Helper functions |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| # Generates "random" local labels |
| sub random_string() { |
| my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_'); |
| my $length = 15; |
| my $str; |
    map { $str .= $chars[rand(@chars)] } 1 .. $length;
| return $str; |
| } |
| |
| # ; Seed the RNG so the labels are generated deterministically |
| srand(12345); |
| |
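# Encrypt the initial tweak with key2 (10 rounds for AES-128, 14 for
# AES-256) and store the result at the base of the tweak frame.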
| sub encrypt_tweak { |
| my $state_tweak = $_[0]; |
| my $is_128 = $_[1]; |
| |
| $code.=<<___; |
| vpxor ($key2), $state_tweak, $state_tweak |
| vaesenc 0x10($key2), $state_tweak, $state_tweak |
| vaesenc 0x20($key2), $state_tweak, $state_tweak |
| vaesenc 0x30($key2), $state_tweak, $state_tweak |
| vaesenc 0x40($key2), $state_tweak, $state_tweak |
| vaesenc 0x50($key2), $state_tweak, $state_tweak |
| vaesenc 0x60($key2), $state_tweak, $state_tweak |
| vaesenc 0x70($key2), $state_tweak, $state_tweak |
| vaesenc 0x80($key2), $state_tweak, $state_tweak |
| vaesenc 0x90($key2), $state_tweak, $state_tweak |
| ___ |
| |
| if ($is_128) { |
| $code .= "vaesenclast 0xa0($key2), $state_tweak, $state_tweak\n"; |
| } else { |
| $code .= "vaesenc 0xa0($key2), $state_tweak, $state_tweak\n"; |
| $code .= "vaesenc 0xb0($key2), $state_tweak, $state_tweak\n"; |
| $code .= "vaesenc 0xc0($key2), $state_tweak, $state_tweak\n"; |
| $code .= "vaesenc 0xd0($key2), $state_tweak, $state_tweak\n"; |
| $code .= "vaesenclast 0xe0($key2), $state_tweak, $state_tweak\n"; |
| } |
| $code .= "vmovdqa $state_tweak, ($TW)\n"; |
| } |
| |
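# Encrypt a single final block: xor with the tweak, run the full AES
# pass with key1, then xor with the tweak again.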
| sub encrypt_final { |
| my $st = $_[0]; |
| my $tw = $_[1]; |
| my $is_128 = $_[2]; |
| |
| # xor Tweak value |
| $code .= "vpxor $tw, $st, $st\n"; |
| $code .= "vpxor ($key1), $st, $st\n"; |
| |
| my $rounds = $is_128 ? 10 : 14; |
| for (my $i = 1; $i < $rounds; $i++) { |
| $code .= "vaesenc 16*$i($key1), $st, $st\n"; |
| } |
| |
| $code .=<<___; |
| vaesenclast 16*$rounds($key1), $st, $st |
| vpxor $tw, $st, $st |
| ___ |
| } |
| |
# decrypt initial blocks of AES
# 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
# next 8 Tweak values are generated
| sub decrypt_initial { |
| my @st; |
| $st[0] = $_[0]; |
| $st[1] = $_[1]; |
| $st[2] = $_[2]; |
| $st[3] = $_[3]; |
| $st[4] = $_[4]; |
| $st[5] = $_[5]; |
| $st[6] = $_[6]; |
| $st[7] = $_[7]; |
| |
| my @tw; |
| $tw[0] = $_[8]; |
| $tw[1] = $_[9]; |
| $tw[2] = $_[10]; |
| $tw[3] = $_[11]; |
| $tw[4] = $_[12]; |
| $tw[5] = $_[13]; |
| $tw[6] = $_[14]; |
| my $t0 = $_[15]; |
| my $num_blocks = $_[16]; |
| my $lt128 = $_[17]; |
| my $is_128 = $_[18]; |
| |
	# num_blocks blocks decrypted
	# num_blocks can be 1, 2, 3, 4, 5, 6, 7
| |
| # xor Tweak value |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| $code .= "vmovdqu ($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vpxor $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| ___ |
| } |
| # round 1 |
| $code .= "vmovdqu 0x10($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, ($TW) # next Tweak1 generated |
mov $TEMPHIGH, 0x08($TW)
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| ___ |
| } |
| |
| # round 2 |
| $code .= "vmovdqu 0x20($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x10($TW) # next Tweak2 generated |
| ___ |
| } |
| |
| # round 3 |
| $code .= "vmovdqu 0x30($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| mov $TEMPHIGH, 0x18($TW) |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| ___ |
| } |
| |
| # round 4 |
| $code .= "vmovdqu 0x40($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x20($TW) # next Tweak3 generated |
| mov $TEMPHIGH, 0x28($TW) |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| ___ |
| } |
| |
| # round 5 |
| $code .= "vmovdqu 0x50($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x30($TW) # next Tweak4 generated |
| mov $TEMPHIGH, 0x38($TW) |
| ___ |
| } |
| |
| # round 6 |
| $code .= "vmovdqu 0x60($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x40($TW) # next Tweak5 generated |
| mov $TEMPHIGH, 0x48($TW) |
| ___ |
| } |
| |
| # round 7 |
| $code .= "vmovdqu 0x70($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x50($TW) # next Tweak6 generated |
| mov $TEMPHIGH, 0x58($TW) |
| ___ |
| } |
| |
| # round 8 |
| $code .= "vmovdqu 0x80($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x60($TW) # next Tweak7 generated |
| mov $TEMPHIGH, 0x68($TW) |
| ___ |
| } |
| |
| # round 9 |
| $code .= "vmovdqu 0x90($key1), $t0\n"; |
| |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| $code .= <<___; |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW, 0x70($TW) # next Tweak8 generated |
| mov $TEMPHIGH, 0x78($TW) |
| ___ |
| } |
| |
| if ($is_128) { |
| # round 10 |
| $code .= "vmovdqu 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } else { |
| # round 10 |
| $code .= "vmovdqu 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 11 |
| $code .= "vmovdqu 0xb0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 12 |
| $code .= "vmovdqu 0xc0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 13 |
| $code .= "vmovdqu 0xd0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 14 |
| $code .= "vmovdqu 0xe0($key1), $t0\n"; |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } |
| |
| # xor Tweak values |
| for (my $i = 0; $i < $num_blocks; $i++) { |
| $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $lt128) { |
| # load next Tweak values |
| $code .= <<___; |
vmovdqa ($TW), $tw[0]
vmovdqa 0x10($TW), $tw[1]
vmovdqa 0x20($TW), $tw[2]
vmovdqa 0x30($TW), $tw[3]
vmovdqa 0x40($TW), $tw[4]
vmovdqa 0x50($TW), $tw[5]
vmovdqa 0x60($TW), $tw[6]
| ___ |
| } |
| } |
| |
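# Load the tweak and the first num_initial_blocks input blocks;
# generate the remaining tweaks from tweak 1 via the scalar
# shl/adc/cmovc/xor doubling, storing each in the tweak frame.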
| sub initialize { |
| my @st; |
| $st[0] = $_[0]; |
| $st[1] = $_[1]; |
| $st[2] = $_[2]; |
| $st[3] = $_[3]; |
| $st[4] = $_[4]; |
| $st[5] = $_[5]; |
| $st[6] = $_[6]; |
| $st[7] = $_[7]; |
| |
| my @tw; |
| $tw[0] = $_[8]; |
| $tw[1] = $_[9]; |
| $tw[2] = $_[10]; |
| $tw[3] = $_[11]; |
| $tw[4] = $_[12]; |
| $tw[5] = $_[13]; |
| $tw[6] = $_[14]; |
| my $num_initial_blocks = $_[15]; |
| |
| $code .= <<___; |
| vmovdqa 0x0($TW), $tw[0] |
| mov 0x0($TW), $TEMPLOW |
| mov 0x08($TW), $TEMPHIGH |
| vmovdqu 0x0($input), $st[0] |
| ___ |
| |
| if ($num_initial_blocks >= 2) { |
| for (my $i = 1; $i < $num_initial_blocks; $i++) { |
| $code .= "xor $gf_poly_8b_temp, $gf_poly_8b_temp\n"; |
| $code .= "shl \$1, $TEMPLOW\n"; |
| $code .= "adc $TEMPHIGH, $TEMPHIGH\n"; |
| $code .= "cmovc $gf_poly_8b, $gf_poly_8b_temp\n"; |
| $code .= "xor $gf_poly_8b_temp, $TEMPLOW\n"; |
| my $offset = $i * 16; |
| $code .= "mov $TEMPLOW, $offset($TW)\n"; |
| $code .= "mov $TEMPHIGH, $offset + 8($TW)\n"; |
| $code .= "vmovdqa $offset($TW), $tw[$i]\n"; |
| $code .= "vmovdqu $offset($input), $st[$i]\n"; |
| } |
| } |
| } |
| |
# Encrypt up to 4 blocks in parallel (one ZMM register; also invoked
# with YMM operands for 1-2 blocks)
| sub encrypt_by_four { |
| my $st1 = $_[0]; # state 1 |
| my $tw1 = $_[1]; # tweak 1 |
| my $tmp = $_[2]; |
| my $is_128 = $_[3]; |
| |
| $code .= "vbroadcasti32x4 ($key1), $tmp\n"; |
| $code .= "vpternlogq \$0x96, $tmp, $tw1, $st1\n"; |
| |
| my $rounds = $is_128 ? 10 : 14; |
| for (my $i = 1; $i < $rounds; $i++) { |
| $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n"; |
| $code .= "vaesenc $tmp, $st1, $st1\n"; |
| } |
| |
| $code .= "vbroadcasti32x4 16*$rounds($key1), $tmp\n"; |
| $code .= "vaesenclast $tmp, $st1, $st1\n"; |
| |
| $code .= "vpxorq $tw1, $st1, $st1\n"; |
| } |
| |
| # Encrypt 8 blocks in parallel |
| # generate next 8 tweak values |
| sub encrypt_by_eight_zmm { |
| my $st1 = $_[0]; |
| my $st2 = $_[1]; |
| my $tw1 = $_[2]; |
| my $tw2 = $_[3]; |
| my $t0 = $_[4]; |
| my $last_eight = $_[5]; |
| my $is_128 = $_[6]; |
| |
| $code .= <<___; |
| vbroadcasti32x4 ($key1), $t0 |
| vpternlogq \$0x96, $t0, $tw1, $st1 |
| vpternlogq \$0x96, $t0, $tw2, $st2 |
| ___ |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw1, %zmm13 |
| vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw1, %zmm15 |
| vpxord %zmm14, %zmm15, %zmm15 |
| ___ |
| } |
| # round 1 |
| $code .= <<___; |
| vbroadcasti32x4 0x10($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 2 |
| vbroadcasti32x4 0x20($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 3 |
| vbroadcasti32x4 0x30($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| ___ |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw2, %zmm13 |
| vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw2, %zmm16 |
| vpxord %zmm14, %zmm16, %zmm16 |
| ___ |
| } |
| |
| $code .= <<___; |
| # round 4 |
| vbroadcasti32x4 0x40($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 5 |
| vbroadcasti32x4 0x50($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 6 |
| vbroadcasti32x4 0x60($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 7 |
| vbroadcasti32x4 0x70($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 8 |
| vbroadcasti32x4 0x80($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 9 |
| vbroadcasti32x4 0x90($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| ___ |
| |
| if ($is_128) { |
| $code .= <<___; |
| # round 10 |
| vbroadcasti32x4 0xa0($key1), $t0 |
| vaesenclast $t0, $st1, $st1 |
| vaesenclast $t0, $st2, $st2 |
| ___ |
| } else { |
| $code .= <<___; |
| # round 10 |
| vbroadcasti32x4 0xa0($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 11 |
| vbroadcasti32x4 0xb0($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 12 |
| vbroadcasti32x4 0xc0($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 13 |
| vbroadcasti32x4 0xd0($key1), $t0 |
| vaesenc $t0, $st1, $st1 |
| vaesenc $t0, $st2, $st2 |
| |
| # round 14 |
| vbroadcasti32x4 0xe0($key1), $t0 |
| vaesenclast $t0, $st1, $st1 |
| vaesenclast $t0, $st2, $st2 |
| ___ |
| } |
| |
| # xor Tweak values |
| $code .= "vpxorq $tw1, $st1, $st1\n"; |
| $code .= "vpxorq $tw2, $st2, $st2\n"; |
| |
| if (0 == $last_eight) { |
| # load next Tweak values |
| $code .= <<___; |
| vmovdqa32 %zmm15, $tw1 |
| vmovdqa32 %zmm16, $tw2 |
| ___ |
| } |
| } |
| |
| # Decrypt 8 blocks in parallel |
| # generate next 8 tweak values |
| sub decrypt_by_eight_zmm { |
| my $st1 = $_[0]; |
| my $st2 = $_[1]; |
| my $tw1 = $_[2]; |
| my $tw2 = $_[3]; |
| my $t0 = $_[4]; |
| my $last_eight = $_[5]; |
| my $is_128 = $_[6]; |
| |
| $code .= <<___; |
| # xor Tweak values |
| vpxorq $tw1, $st1, $st1 |
| vpxorq $tw2, $st2, $st2 |
| |
| # ARK |
| vbroadcasti32x4 ($key1), $t0 |
| vpxorq $t0, $st1, $st1 |
| vpxorq $t0, $st2, $st2 |
| ___ |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw1, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw1, %zmm15 |
| vpxord %zmm14, %zmm15, %zmm15 |
| ___ |
| } |
| # round 1 |
| $code .= <<___; |
| vbroadcasti32x4 0x10($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 2 |
| vbroadcasti32x4 0x20($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 3 |
| vbroadcasti32x4 0x30($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| ___ |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw2, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw2, %zmm16 |
| vpxord %zmm14, %zmm16, %zmm16 |
| ___ |
| } |
| |
| $code .= <<___; |
| # round 4 |
| vbroadcasti32x4 0x40($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 5 |
| vbroadcasti32x4 0x50($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 6 |
| vbroadcasti32x4 0x60($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 7 |
| vbroadcasti32x4 0x70($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 8 |
| vbroadcasti32x4 0x80($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 9 |
| vbroadcasti32x4 0x90($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| ___ |
| if ($is_128) { |
| $code .= <<___; |
| # round 10 |
| vbroadcasti32x4 0xa0($key1), $t0 |
| vaesdeclast $t0, $st1, $st1 |
| vaesdeclast $t0, $st2, $st2 |
| ___ |
| } else { |
| $code .= <<___; |
| # round 10 |
| vbroadcasti32x4 0xa0($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 11 |
| vbroadcasti32x4 0xb0($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 12 |
| vbroadcasti32x4 0xc0($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 13 |
| vbroadcasti32x4 0xd0($key1), $t0 |
| vaesdec $t0, $st1, $st1 |
| vaesdec $t0, $st2, $st2 |
| |
| # round 14 |
| vbroadcasti32x4 0xe0($key1), $t0 |
| vaesdeclast $t0, $st1, $st1 |
| vaesdeclast $t0, $st2, $st2 |
| ___ |
| } |
| |
| $code .= <<___; |
| # xor Tweak values |
| vpxorq $tw1, $st1, $st1 |
| vpxorq $tw2, $st2, $st2 |
| |
| # load next Tweak values |
| vmovdqa32 %zmm15, $tw1 |
| vmovdqa32 %zmm16, $tw2 |
| ___ |
| } |
| |
| # Encrypt 16 blocks in parallel |
| # generate next 16 tweak values |
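# Tweak generation for the following 16 blocks is stitched between the
# AES rounds to hide the vpclmulqdq and shift latencies.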
| sub encrypt_by_16_zmm { |
| my @st; |
| $st[0] = $_[0]; |
| $st[1] = $_[1]; |
| $st[2] = $_[2]; |
| $st[3] = $_[3]; |
| |
| my @tw; |
| $tw[0] = $_[4]; |
| $tw[1] = $_[5]; |
| $tw[2] = $_[6]; |
| $tw[3] = $_[7]; |
| |
| my $t0 = $_[8]; |
| my $last_eight = $_[9]; |
| my $is_128 = $_[10]; |
| |
| # xor Tweak values |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| # ARK |
| $code .= "vbroadcasti32x4 ($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw[2], %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw[2], %zmm15 |
| vpxord %zmm14, %zmm15, %zmm15 |
| ___ |
| } |
| |
| # round 1 |
| $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 2 |
| $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 3 |
| $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw[3], %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw[3], %zmm16 |
| vpxord %zmm14, %zmm16, %zmm16 |
| ___ |
| } |
| # round 4 |
| $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 5 |
| $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 6 |
| $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, %zmm15, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, %zmm15, %zmm17 |
| vpxord %zmm14, %zmm17, %zmm17 |
| ___ |
| } |
| # round 7 |
| $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 8 |
| $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 9 |
| $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, %zmm16, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, %zmm16, %zmm18 |
| vpxord %zmm14, %zmm18, %zmm18 |
| ___ |
| } |
| if ($is_128) { |
| # round 10 |
| $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } else { |
| # round 10 |
| $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| # round 11 |
| $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| # round 12 |
| $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| # round 13 |
| $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; |
| } |
| # round 14 |
| $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } |
| |
| # xor Tweak values |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| $code .= <<___; |
| # load next Tweak values |
| vmovdqa32 %zmm15, $tw[0] |
| vmovdqa32 %zmm16, $tw[1] |
| vmovdqa32 %zmm17, $tw[2] |
| vmovdqa32 %zmm18, $tw[3] |
| ___ |
| } |
| |
| # Decrypt 16 blocks in parallel |
# generate next 16 tweak values
| sub decrypt_by_16_zmm { |
| my @st; |
| $st[0] = $_[0]; |
| $st[1] = $_[1]; |
| $st[2] = $_[2]; |
| $st[3] = $_[3]; |
| |
| my @tw; |
| $tw[0] = $_[4]; |
| $tw[1] = $_[5]; |
| $tw[2] = $_[6]; |
| $tw[3] = $_[7]; |
| |
| my $t0 = $_[8]; |
| my $last_eight = $_[9]; |
| my $is_128 = $_[10]; |
| |
| # xor Tweak values |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| # ARK |
| $code .= "vbroadcasti32x4 ($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw[2], %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw[2], %zmm15 |
| vpxord %zmm14, %zmm15, %zmm15 |
| ___ |
| } |
| |
| # round 1 |
| $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 2 |
| $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 3 |
| $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, $tw[3], %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, $tw[3], %zmm16 |
| vpxord %zmm14, %zmm16, %zmm16 |
| ___ |
| } |
| # round 4 |
| $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 5 |
| $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 6 |
| $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, %zmm15, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, %zmm15, %zmm17 |
| vpxord %zmm14, %zmm17, %zmm17 |
| ___ |
| } |
| # round 7 |
| $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 8 |
| $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 9 |
| $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| if (0 == $last_eight) { |
| $code .= <<___; |
| vpsrldq \$0xf, %zmm16, %zmm13 |
| vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 |
| vpslldq \$0x1, %zmm16, %zmm18 |
| vpxord %zmm14, %zmm18, %zmm18 |
| ___ |
| } |
| if ($is_128) { |
| # round 10 |
| $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } else { |
| # round 10 |
| $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 11 |
| $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 12 |
| $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 13 |
| $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; |
| } |
| |
| # round 14 |
| $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; |
| } |
| } |
| |
| # xor Tweak values |
| for (my $i = 0; $i < 4; $i++) { |
| $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; |
| } |
| |
| $code .= <<___; |
| # load next Tweak values |
| vmovdqa32 %zmm15, $tw[0] |
| vmovdqa32 %zmm16, $tw[1] |
| vmovdqa32 %zmm17, $tw[2] |
| vmovdqa32 %zmm18, $tw[3] |
| ___ |
| } |
| |
| $code .= ".text\n"; |
| |
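# Returns nonzero when the CPU advertises AVX512F/DQ/BW/VL together
# with VAES, VPCLMULQDQ and AVX512_VBMI2, i.e. everything the code
# below requires.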
| { |
| $code.=<<"___"; |
| .extern OPENSSL_ia32cap_P |
| .globl aesni_xts_avx512_eligible |
| .type aesni_xts_avx512_eligible,\@abi-omnipotent |
| .align 32 |
| aesni_xts_avx512_eligible: |
| mov OPENSSL_ia32cap_P+8(%rip), %ecx |
| xor %eax,%eax |
| # 1<<31|1<<30|1<<17|1<<16 avx512vl + avx512bw + avx512dq + avx512f |
| and \$0xc0030000, %ecx |
| cmp \$0xc0030000, %ecx |
| jne .L_done |
| mov OPENSSL_ia32cap_P+12(%rip), %ecx |
| # 1<<10|1<<9|1<<6 vaes + vpclmulqdq + vbmi2 |
| and \$0x640, %ecx |
| cmp \$0x640, %ecx |
| cmove %ecx,%eax |
| .L_done: |
| ret |
| .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible |
| ___ |
| } |
| |
| |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # ;void aesni_xts_[128|256]_encrypt_avx512( |
| # ; const uint8_t *in, // input data |
| # ; uint8_t *out, // output data |
| # ; size_t length, // sector size, in bytes |
| # ; const AES_KEY *key1, // key used for "ECB" encryption |
| # ; const AES_KEY *key2, // key used for tweaking |
| # ; const uint8_t iv[16]) // initial tweak value, 16 bytes |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| sub enc { |
| my $is_128 = $_[0]; |
| my $rndsuffix = &random_string(); |
| |
| if ($is_128) { |
| $code.=<<___; |
| .globl aesni_xts_128_encrypt_avx512 |
| .hidden aesni_xts_128_encrypt_avx512 |
| .type aesni_xts_128_encrypt_avx512,\@function,6 |
| .align 32 |
| aesni_xts_128_encrypt_avx512: |
| .cfi_startproc |
| endbranch |
| ___ |
| } else { |
| $code.=<<___; |
| .globl aesni_xts_256_encrypt_avx512 |
| .hidden aesni_xts_256_encrypt_avx512 |
| .type aesni_xts_256_encrypt_avx512,\@function,6 |
| .align 32 |
| aesni_xts_256_encrypt_avx512: |
| .cfi_startproc |
| endbranch |
| ___ |
| } |
| $code .= "push %rbp\n"; |
| $code .= "mov $TW,%rbp\n"; |
| $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; |
| $code .= "and \$0xffffffffffffffc0,$TW\n"; |
| $code .= "mov %rbx,$GP_STORAGE($TW)\n"; |
| |
| if ($win64) { |
| $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; |
| $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; |
| $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; |
| $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; |
| $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; |
| $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; |
| $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; |
| $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; |
| $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; |
| $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; |
| $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; |
| $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; |
| } |
| |
| $code .= "mov \$0x87, $gf_poly_8b\n"; |
| $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values |
| |
| encrypt_tweak("%xmm1", $is_128); |
| |
| if ($win64) { |
		$code .= "mov $input, 8 + 8*5(%rbp)\n";		# plaintext pointer
		$code .= "mov $output, 8 + 8*6(%rbp)\n";	# ciphertext pointer
| } |
| |
| { |
| $code.=<<___; |
| |
| cmp \$0x80,$length |
| jl .L_less_than_128_bytes_${rndsuffix} |
| vpbroadcastq $gf_poly_8b,$ZPOLY |
| cmp \$0x100,$length |
| jge .L_start_by16_${rndsuffix} |
| cmp \$0x80,$length |
| jge .L_start_by8_${rndsuffix} |
| |
| .L_do_n_blocks_${rndsuffix}: |
| cmp \$0x0,$length |
| je .L_ret_${rndsuffix} |
| cmp \$0x70,$length |
| jge .L_remaining_num_blocks_is_7_${rndsuffix} |
| cmp \$0x60,$length |
| jge .L_remaining_num_blocks_is_6_${rndsuffix} |
| cmp \$0x50,$length |
| jge .L_remaining_num_blocks_is_5_${rndsuffix} |
| cmp \$0x40,$length |
| jge .L_remaining_num_blocks_is_4_${rndsuffix} |
| cmp \$0x30,$length |
| jge .L_remaining_num_blocks_is_3_${rndsuffix} |
| cmp \$0x20,$length |
| jge .L_remaining_num_blocks_is_2_${rndsuffix} |
| cmp \$0x10,$length |
| jge .L_remaining_num_blocks_is_1_${rndsuffix} |
| vmovdqa %xmm0,%xmm8 |
| vmovdqa %xmm9,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_7_${rndsuffix}: |
| mov \$0x0000ffffffffffff,$tmp1 |
| kmovq $tmp1,%k1 |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2{%k1} |
| add \$0x70,$input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %zmm2,0x40($output){%k1} |
| add \$0x70,$output |
| vextracti32x4 \$0x2,%zmm2,%xmm8 |
| vextracti32x4 \$0x3,%zmm10,%xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_6_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%ymm2 |
| add \$0x60,$input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %ymm2,0x40($output) |
| add \$0x60,$output |
| vextracti32x4 \$0x1,%zmm2,%xmm8 |
| vextracti32x4 \$0x2,%zmm10,%xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_5_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu 0x40($input),%xmm2 |
| add \$0x50,$input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu %xmm2,0x40($output) |
| add \$0x50,$output |
| vmovdqa %xmm2,%xmm8 |
| vextracti32x4 \$0x1,%zmm10,%xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_4_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| add \$0x40,$input |
| ___ |
| } |
| |
| encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| add \$0x40,$output |
| vextracti32x4 \$0x3,%zmm1,%xmm8 |
| vmovdqa64 %xmm10, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_remaining_num_blocks_is_3_${rndsuffix}: |
| mov \$-1, $tmp1 |
| shr \$0x10, $tmp1 |
| kmovq $tmp1, %k1 |
| vmovdqu8 ($input), %zmm1{%k1} |
| add \$0x30, $input |
| ___ |
| } |
| |
| encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output){%k1} |
| add \$0x30, $output |
| vextracti32x4 \$0x2, %zmm1, %xmm8 |
| vextracti32x4 \$0x3, %zmm9, %xmm0 |
| and \$0xf, $length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_remaining_num_blocks_is_2_${rndsuffix}: |
| vmovdqu8 ($input), %ymm1 |
| add \$0x20, $input |
| ___ |
| } |
| |
| encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %ymm1,($output) |
| add \$0x20,$output |
| vextracti32x4 \$0x1, %zmm1, %xmm8 |
| vextracti32x4 \$0x2,%zmm9,%xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_remaining_num_blocks_is_1_${rndsuffix}: |
| vmovdqu ($input),%xmm1 |
| add \$0x10,$input |
| ___ |
| } |
| |
| encrypt_final("%xmm1", "%xmm9", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| add \$0x10,$output |
| vmovdqa %xmm1,%xmm8 |
| vextracti32x4 \$0x1,%zmm9,%xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| |
| .L_start_by16_${rndsuffix}: |
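	# Compute tweaks 1-8 from the initial tweak (multiply by
	# 2^{3,2,1,0} and 2^{7,6,5,4}), then tweaks 9-16 by a further
	# multiply by 2^8.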
| vbroadcasti32x4 ($TW),%zmm0 |
| vbroadcasti32x4 shufb_15_7(%rip),%zmm8 |
| mov \$0xaa,$tmp1 |
| kmovq $tmp1,%k2 |
| vpshufb %zmm8,%zmm0,%zmm1 |
| vpsllvq const_dq3210(%rip),%zmm0,%zmm4 |
| vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 |
| vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 |
| vpxorq %zmm2,%zmm4,%zmm4{%k2} |
| vpxord %zmm4,%zmm3,%zmm9 |
| vpsllvq const_dq7654(%rip),%zmm0,%zmm5 |
| vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 |
| vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 |
| vpxorq %zmm6,%zmm5,%zmm5{%k2} |
| vpxord %zmm5,%zmm7,%zmm10 |
| vpsrldq \$0xf,%zmm9,%zmm13 |
| vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14 |
| vpslldq \$0x1,%zmm9,%zmm11 |
| vpxord %zmm14,%zmm11,%zmm11 |
| vpsrldq \$0xf,%zmm10,%zmm15 |
| vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16 |
| vpslldq \$0x1,%zmm10,%zmm12 |
| vpxord %zmm16,%zmm12,%zmm12 |
| |
| .L_main_loop_run_16_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2 |
| vmovdqu8 0x80($input),%zmm3 |
| vmovdqu8 0xc0($input),%zmm4 |
| add \$0x100,$input |
| ___ |
| } |
| |
| encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", |
| "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %zmm2,0x40($output) |
| vmovdqu8 %zmm3,0x80($output) |
| vmovdqu8 %zmm4,0xc0($output) |
| add \$0x100,$output |
| sub \$0x100,$length |
| cmp \$0x100,$length |
| jae .L_main_loop_run_16_${rndsuffix} |
| cmp \$0x80,$length |
| jae .L_main_loop_run_8_${rndsuffix} |
| vextracti32x4 \$0x3,%zmm4,%xmm0 |
| jmp .L_do_n_blocks_${rndsuffix} |
| |
| .L_start_by8_${rndsuffix}: |
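	# Compute tweaks 1-8 from the initial tweak (multiply by
	# 2^{3,2,1,0} and 2^{7,6,5,4}) via the shift/clmul idiom.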
| vbroadcasti32x4 ($TW),%zmm0 |
| vbroadcasti32x4 shufb_15_7(%rip),%zmm8 |
| mov \$0xaa,$tmp1 |
| kmovq $tmp1,%k2 |
| vpshufb %zmm8,%zmm0,%zmm1 |
| vpsllvq const_dq3210(%rip),%zmm0,%zmm4 |
| vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 |
| vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 |
| vpxorq %zmm2,%zmm4,%zmm4{%k2} |
| vpxord %zmm4,%zmm3,%zmm9 |
| vpsllvq const_dq7654(%rip),%zmm0,%zmm5 |
| vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 |
| vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 |
| vpxorq %zmm6,%zmm5,%zmm5{%k2} |
| vpxord %zmm5,%zmm7,%zmm10 |
| |
| .L_main_loop_run_8_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2 |
| add \$0x80,$input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %zmm2,0x40($output) |
| add \$0x80,$output |
| sub \$0x80,$length |
| cmp \$0x80,$length |
| jae .L_main_loop_run_8_${rndsuffix} |
| vextracti32x4 \$0x3,%zmm2,%xmm0 |
| jmp .L_do_n_blocks_${rndsuffix} |
| |
| .L_steal_cipher_${rndsuffix}: |
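	# Ciphertext stealing: write the leading bytes of the previous
	# ciphertext block (%xmm8) as the final partial output, complete
	# the last partial input block with the remaining bytes of that
	# ciphertext block, then encrypt it with the last tweak (%xmm0).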
| vmovdqa %xmm8,%xmm2 |
| lea vpshufb_shf_table(%rip),$TEMPLOW |
| vmovdqu ($TEMPLOW,$length,1),%xmm10 |
| vpshufb %xmm10,%xmm8,%xmm8 |
| vmovdqu -0x10($input,$length,1),%xmm3 |
| vmovdqu %xmm8,-0x10($output,$length,1) |
| lea vpshufb_shf_table(%rip),$TEMPLOW |
| add \$16, $TEMPLOW |
| sub $length,$TEMPLOW |
| vmovdqu ($TEMPLOW),%xmm10 |
| vpxor mask1(%rip),%xmm10,%xmm10 |
| vpshufb %xmm10,%xmm3,%xmm3 |
| vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 |
| vpxor %xmm0,%xmm3,%xmm8 |
| vpxor ($key1),%xmm8,%xmm8 |
| vaesenc 0x10($key1),%xmm8,%xmm8 |
| vaesenc 0x20($key1),%xmm8,%xmm8 |
| vaesenc 0x30($key1),%xmm8,%xmm8 |
| vaesenc 0x40($key1),%xmm8,%xmm8 |
| vaesenc 0x50($key1),%xmm8,%xmm8 |
| vaesenc 0x60($key1),%xmm8,%xmm8 |
| vaesenc 0x70($key1),%xmm8,%xmm8 |
| vaesenc 0x80($key1),%xmm8,%xmm8 |
| vaesenc 0x90($key1),%xmm8,%xmm8 |
| ___ |
| if ($is_128) { |
| $code .= "vaesenclast 0xa0($key1),%xmm8,%xmm8\n"; |
| } else { |
| $code .= <<___ |
| vaesenc 0xa0($key1),%xmm8,%xmm8 |
| vaesenc 0xb0($key1),%xmm8,%xmm8 |
| vaesenc 0xc0($key1),%xmm8,%xmm8 |
| vaesenc 0xd0($key1),%xmm8,%xmm8 |
| vaesenclast 0xe0($key1),%xmm8,%xmm8 |
| ___ |
| } |
| $code .= "vpxor %xmm0,%xmm8,%xmm8\n"; |
| $code .= "vmovdqu %xmm8,-0x10($output)\n"; |
| } |
| |
| { |
| $code .= <<___; |
| .L_ret_${rndsuffix}: |
| mov $GP_STORAGE($TW),%rbx |
| xor $tmp1,$tmp1 |
| mov $tmp1,$GP_STORAGE($TW) |
| # Zero-out the whole of `%zmm0`. |
| vpxorq %zmm0,%zmm0,%zmm0 |
| ___ |
| } |
| |
| if ($win64) { |
| $code .= <<___; |
| mov $GP_STORAGE + 8*1($TW),%rdi |
| mov $tmp1,$GP_STORAGE + 8*1($TW) |
| mov $GP_STORAGE + 8*2($TW),%rsi |
| mov $tmp1,$GP_STORAGE + 8*2($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 |
| vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 |
| vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 |
| vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 |
| |
| # Zero the 64 bytes we just restored to the xmm registers. |
| vmovdqa64 %zmm0,$XMM_STORAGE($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 |
| vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 |
| vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 |
| vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 |
| |
| # And again. |
| vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 |
| vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 |
| |
	# The last batch is only 32 bytes (256 bits), so we use `%ymm` as
	# the source operand.
| vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| mov %rbp,$TW |
| pop %rbp |
| vzeroupper |
| ret |
| |
| .L_less_than_128_bytes_${rndsuffix}: |
| vpbroadcastq $gf_poly_8b, $ZPOLY |
| cmp \$0x10,$length |
| jb .L_ret_${rndsuffix} |
| vbroadcasti32x4 ($TW), %zmm0 |
| vbroadcasti32x4 shufb_15_7(%rip), %zmm8 |
| movl \$0xaa, %r8d |
| kmovq %r8, %k2 |
| mov $length,$tmp1 |
| and \$0x70,$tmp1 |
| cmp \$0x60,$tmp1 |
| je .L_num_blocks_is_6_${rndsuffix} |
| cmp \$0x50,$tmp1 |
| je .L_num_blocks_is_5_${rndsuffix} |
| cmp \$0x40,$tmp1 |
| je .L_num_blocks_is_4_${rndsuffix} |
| cmp \$0x30,$tmp1 |
| je .L_num_blocks_is_3_${rndsuffix} |
| cmp \$0x20,$tmp1 |
| je .L_num_blocks_is_2_${rndsuffix} |
| cmp \$0x10,$tmp1 |
| je .L_num_blocks_is_1_${rndsuffix} |
| |
| .L_num_blocks_is_7_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| vpsllvq const_dq7654(%rip), %zmm0, %zmm5 |
| vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 |
| vpxorq %zmm6, %zmm5, %zmm5{%k2} |
| vpxord %zmm5, %zmm7, %zmm10 |
| mov \$0x0000ffffffffffff, $tmp1 |
| kmovq $tmp1, %k1 |
| vmovdqu8 16*0($input), %zmm1 |
| vmovdqu8 16*4($input), %zmm2{%k1} |
| |
| add \$0x70,$input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, 16*0($output) |
| vmovdqu8 %zmm2, 16*4($output){%k1} |
| add \$0x70,$output |
| vextracti32x4 \$0x2, %zmm2, %xmm8 |
| vextracti32x4 \$0x3, %zmm10, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_6_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| vpsllvq const_dq7654(%rip), %zmm0, %zmm5 |
| vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 |
| vpxorq %zmm6, %zmm5, %zmm5{%k2} |
| vpxord %zmm5, %zmm7, %zmm10 |
| vmovdqu8 16*0($input), %zmm1 |
| vmovdqu8 16*4($input), %ymm2 |
| add \$96, $input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, 16*0($output) |
| vmovdqu8 %ymm2, 16*4($output) |
| add \$96, $output |
| |
| vextracti32x4 \$0x1, %ymm2, %xmm8 |
| vextracti32x4 \$0x2, %zmm10, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_5_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| vpsllvq const_dq7654(%rip), %zmm0, %zmm5 |
| vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 |
| vpxorq %zmm6, %zmm5, %zmm5{%k2} |
| vpxord %zmm5, %zmm7, %zmm10 |
| vmovdqu8 16*0($input), %zmm1 |
| vmovdqu8 16*4($input), %xmm2 |
| add \$80, $input |
| ___ |
| } |
| |
| encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, 16*0($output) |
| vmovdqu8 %xmm2, 16*4($output) |
| add \$80, $output |
| |
| vmovdqa %xmm2, %xmm8 |
| vextracti32x4 \$0x1, %zmm10, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_4_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| vpsllvq const_dq7654(%rip), %zmm0, %zmm5 |
| vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 |
| vpxorq %zmm6, %zmm5, %zmm5{%k2} |
| vpxord %zmm5, %zmm7, %zmm10 |
| vmovdqu8 16*0($input), %zmm1 |
| add \$64, $input |
| ___ |
| } |
| |
| encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, 16*0($output) |
| add \$64, $output |
| vextracti32x4 \$0x3, %zmm1, %xmm8 |
| vmovdqa %xmm10, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_3_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| mov \$0x0000ffffffffffff, $tmp1 |
| kmovq $tmp1, %k1 |
| vmovdqu8 16*0($input), %zmm1{%k1} |
| add \$48, $input |
| ___ |
| } |
| |
| encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, 16*0($output){%k1} |
| add \$48, $output |
| vextracti32x4 \$2, %zmm1, %xmm8 |
| vextracti32x4 \$3, %zmm9, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_2_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| |
| vmovdqu8 16*0($input), %ymm1 |
| add \$32, $input |
| ___ |
| } |
| |
| encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %ymm1, 16*0($output) |
| add \$32, $output |
| |
| vextracti32x4 \$1, %ymm1, %xmm8 |
| vextracti32x4 \$2, %zmm9, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_num_blocks_is_1_${rndsuffix}: |
| vpshufb %zmm8, %zmm0, %zmm1 |
| vpsllvq const_dq3210(%rip), %zmm0, %zmm4 |
| vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 |
| vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 |
| vpxorq %zmm2, %zmm4, %zmm4{%k2} |
| vpxord %zmm4, %zmm3, %zmm9 |
| |
| vmovdqu8 16*0($input), %xmm1 |
| add \$16, $input |
| ___ |
| } |
| |
| encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %xmm1, 16*0($output) |
| add \$16, $output |
| |
| vmovdqa %xmm1, %xmm8 |
| vextracti32x4 \$1, %zmm9, %xmm0 |
| and \$0xf,$length |
| je .L_ret_${rndsuffix} |
| jmp .L_steal_cipher_${rndsuffix} |
| .cfi_endproc |
| ___ |
| } |
| } |
| |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| # ;void aesni_xts_[128|256]_decrypt_avx512( |
| # ; const uint8_t *in, // input data |
| # ; uint8_t *out, // output data |
| # ; size_t length, // sector size, in bytes |
# ; const AES_KEY *key1, // key used for "ECB" decryption
# ; const AES_KEY *key2, // key used for tweaking
| # ; const uint8_t iv[16]) // initial tweak value, 16 bytes |
| # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| sub dec { |
| my $is_128 = $_[0]; |
| my $rndsuffix = &random_string(); |
| |
| if ($is_128) { |
| $code.=<<___; |
| .globl aesni_xts_128_decrypt_avx512 |
| .hidden aesni_xts_128_decrypt_avx512 |
| .type aesni_xts_128_decrypt_avx512,\@function,6 |
| .align 32 |
| aesni_xts_128_decrypt_avx512: |
| .cfi_startproc |
| endbranch |
| ___ |
| } else { |
| $code.=<<___; |
| .globl aesni_xts_256_decrypt_avx512 |
| .hidden aesni_xts_256_decrypt_avx512 |
| .type aesni_xts_256_decrypt_avx512,\@function,6 |
| .align 32 |
| aesni_xts_256_decrypt_avx512: |
| .cfi_startproc |
| endbranch |
| ___ |
| } |
| $code .= "push %rbp\n"; |
| $code .= "mov $TW,%rbp\n"; |
| $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; |
| $code .= "and \$0xffffffffffffffc0,$TW\n"; |
| $code .= "mov %rbx,$GP_STORAGE($TW)\n"; |
| |
| if ($win64) { |
| $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; |
| $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; |
| $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; |
| $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; |
| $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; |
| $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; |
| $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; |
| $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; |
| $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; |
| $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; |
| $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; |
| $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; |
| } |
| |
| $code .= "mov \$0x87, $gf_poly_8b\n"; |
| $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values |
| |
| encrypt_tweak("%xmm1", $is_128); |
| |
| if ($win64) { |
| $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer |
| $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer |
| } |
| |
| { |
| $code.=<<___; |
| |
| cmp \$0x80,$length |
| jb .L_less_than_128_bytes_${rndsuffix} |
| vpbroadcastq $gf_poly_8b,$ZPOLY |
| cmp \$0x100,$length |
| jge .L_start_by16_${rndsuffix} |
| jmp .L_start_by8_${rndsuffix} |
| |
| .L_do_n_blocks_${rndsuffix}: |
| cmp \$0x0,$length |
| je .L_ret_${rndsuffix} |
| cmp \$0x70,$length |
| jge .L_remaining_num_blocks_is_7_${rndsuffix} |
| cmp \$0x60,$length |
| jge .L_remaining_num_blocks_is_6_${rndsuffix} |
| cmp \$0x50,$length |
| jge .L_remaining_num_blocks_is_5_${rndsuffix} |
| cmp \$0x40,$length |
| jge .L_remaining_num_blocks_is_4_${rndsuffix} |
| cmp \$0x30,$length |
| jge .L_remaining_num_blocks_is_3_${rndsuffix} |
| cmp \$0x20,$length |
| jge .L_remaining_num_blocks_is_2_${rndsuffix} |
| cmp \$0x10,$length |
| jge .L_remaining_num_blocks_is_1_${rndsuffix} |
| |
	# _remaining_num_blocks_is_0:
	# xmm5 holds the last full block, to be decrypted with the next tweak
	vmovdqu %xmm5, %xmm1
| ___ |
| } |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1, -0x10($output) |
| vmovdqa %xmm1, %xmm8 |
| |
| # Calc previous tweak |
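	# The previous tweak is the next tweak (%xmm9) divided by x in
	# GF(2^128): shift the 128-bit value right by one bit (vpshrdq)
	# and fold the reduction polynomial back in when the bit shifted
	# out was set.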
| mov \$0x1,$tmp1 |
| kmovq $tmp1, %k1 |
| vpsllq \$0x3f,%xmm9,%xmm13 |
| vpsraq \$0x3f,%xmm13,%xmm14 |
| vpandq %xmm25,%xmm14,%xmm5 |
| vpxorq %xmm5,%xmm9,%xmm9{%k1} |
| vpsrldq \$0x8,%xmm9,%xmm10 |
| .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0 |
| vpslldq \$0x8,%xmm13,%xmm13 |
| vpxorq %xmm13,%xmm0,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_7_${rndsuffix}: |
| mov \$0xffffffffffffffff,$tmp1 |
| shr \$0x10,$tmp1 |
| kmovq $tmp1,%k1 |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2{%k1} |
| add \$0x70,$input |
| and \$0xf,$length |
| je .L_done_7_remain_${rndsuffix} |
| vextracti32x4 \$0x2,%zmm10,%xmm12 |
| vextracti32x4 \$0x3,%zmm10,%xmm13 |
| vinserti32x4 \$0x2,%xmm13,%zmm10,%zmm10 |
| ___ |
| } |
| |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu8 %zmm2, 0x40($output){%k1} |
| add \$0x70, $output |
| vextracti32x4 \$0x2,%zmm2,%xmm8 |
| vmovdqa %xmm12,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_7_remain_${rndsuffix}:\n"; |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu8 %zmm2, 0x40($output){%k1} |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_6_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%ymm2 |
| add \$0x60,$input |
| and \$0xf, $length |
| je .L_done_6_remain_${rndsuffix} |
| vextracti32x4 \$0x1,%zmm10,%xmm12 |
| vextracti32x4 \$0x2,%zmm10,%xmm13 |
| vinserti32x4 \$0x1,%xmm13,%zmm10,%zmm10 |
| ___ |
| } |
| |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu8 %ymm2, 0x40($output) |
| add \$0x60,$output |
| vextracti32x4 \$0x1,%zmm2,%xmm8 |
| vmovdqa %xmm12,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_6_remain_${rndsuffix}:\n"; |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu8 %ymm2,0x40($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_5_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu 0x40($input),%xmm2 |
| add \$0x50,$input |
| and \$0xf,$length |
| je .L_done_5_remain_${rndsuffix} |
| vmovdqa %xmm10,%xmm12 |
| vextracti32x4 \$0x1,%zmm10,%xmm10 |
| ___ |
| } |
| |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu %xmm2, 0x40($output) |
| add \$0x50, $output |
| vmovdqa %xmm2,%xmm8 |
| vmovdqa %xmm12,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_5_remain_${rndsuffix}:\n"; |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| vmovdqu8 %xmm2, 0x40($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_4_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| add \$0x40,$input |
| and \$0xf, $length |
| je .L_done_4_remain_${rndsuffix} |
| vextracti32x4 \$0x3,%zmm9,%xmm12 |
| vinserti32x4 \$0x3,%xmm10,%zmm9,%zmm9 |
| ___ |
| } |
| |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| add \$0x40,$output |
| vextracti32x4 \$0x3,%zmm1,%xmm8 |
| vmovdqa %xmm12,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_4_remain_${rndsuffix}:\n"; |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1, ($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_3_${rndsuffix}: |
| vmovdqu ($input),%xmm1 |
| vmovdqu 0x10($input),%xmm2 |
| vmovdqu 0x20($input),%xmm3 |
| add \$0x30,$input |
| and \$0xf,$length |
| je .L_done_3_remain_${rndsuffix} |
| vextracti32x4 \$0x2,%zmm9,%xmm13 |
| vextracti32x4 \$0x1,%zmm9,%xmm10 |
| vextracti32x4 \$0x3,%zmm9,%xmm11 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| add \$0x30,$output |
| vmovdqa %xmm3,%xmm8 |
| vmovdqa %xmm13,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| $code .= "\n.L_done_3_remain_${rndsuffix}:\n"; |
| $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; |
| $code .= "vextracti32x4 \$0x2,%zmm9,%xmm11\n"; |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_2_${rndsuffix}: |
| vmovdqu ($input),%xmm1 |
| vmovdqu 0x10($input),%xmm2 |
| add \$0x20,$input |
| and \$0xf,$length |
| je .L_done_2_remain_${rndsuffix} |
| vextracti32x4 \$0x2,%zmm9,%xmm10 |
| vextracti32x4 \$0x1,%zmm9,%xmm12 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| add \$0x20,$output |
| vmovdqa %xmm2,%xmm8 |
| vmovdqa %xmm12,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| $code .= "\n.L_done_2_remain_${rndsuffix}:\n"; |
| $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_remaining_num_blocks_is_1_${rndsuffix}: |
| vmovdqu ($input),%xmm1 |
| add \$0x10,$input |
| and \$0xf,$length |
| je .L_done_1_remain_${rndsuffix} |
| vextracti32x4 \$0x1,%zmm9,%xmm11 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| add \$0x10,$output |
| vmovdqa %xmm1,%xmm8 |
| vmovdqa %xmm9,%xmm0 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_1_remain_${rndsuffix}:\n"; |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1, ($output) |
| jmp .L_ret_${rndsuffix} |
| |
| .L_start_by16_${rndsuffix}: |
| vbroadcasti32x4 ($TW),%zmm0 |
| vbroadcasti32x4 shufb_15_7(%rip),%zmm8 |
| mov \$0xaa,$tmp1 |
| kmovq $tmp1,%k2 |
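# k2 = 0xaa (10101010b) selects the odd qwords, i.e. the high 64-bit
# half of each 128-bit tweak lane, used below to merge the cross-qword
# carry bits into the right halves.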
| |
| # Mult tweak by 2^{3, 2, 1, 0} |
| vpshufb %zmm8,%zmm0,%zmm1 |
| vpsllvq const_dq3210(%rip),%zmm0,%zmm4 |
| vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 |
| vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3 |
| vpxorq %zmm2,%zmm4,%zmm4{%k2} |
| vpxord %zmm4,%zmm3,%zmm9 |
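# => each 128-bit lane i of %zmm9 now holds T*2^i (i = 0..3) in GF(2^128)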
| |
| # Mult tweak by 2^{7, 6, 5, 4} |
| vpsllvq const_dq7654(%rip),%zmm0,%zmm5 |
| vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 |
vpclmulqdq \$0x0,$ZPOLY,%zmm6,%zmm7
| vpxorq %zmm6,%zmm5,%zmm5{%k2} |
| vpxord %zmm5,%zmm7,%zmm10 |
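# => each 128-bit lane i of %zmm10 now holds T*2^(i+4) (i = 0..3)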
| |
# Derive the next 8 tweak values by multiplying each tweak by 2^8
| vpsrldq \$0xf,%zmm9,%zmm13 |
vpclmulqdq \$0x0,$ZPOLY,%zmm13,%zmm14
| vpslldq \$0x1,%zmm9,%zmm11 |
| vpxord %zmm14,%zmm11,%zmm11 |
| |
| vpsrldq \$0xf,%zmm10,%zmm15 |
vpclmulqdq \$0x0,$ZPOLY,%zmm15,%zmm16
| vpslldq \$0x1,%zmm10,%zmm12 |
| vpxord %zmm16,%zmm12,%zmm12 |
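# %zmm11/%zmm12 now hold tweaks 8..15: each lane of %zmm9/%zmm10 is
# multiplied by 2^8 (a one-byte vpslldq), with the top byte extracted
# by vpsrldq and reduced via vpclmulqdq against the poly in $ZPOLY.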
| |
| .L_main_loop_run_16_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2 |
| vmovdqu8 0x80($input),%zmm3 |
| vmovdqu8 0xc0($input),%zmm4 |
| vmovdqu8 0xf0($input),%xmm5 |
| add \$0x100,$input |
| ___ |
| } |
| |
| decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", |
| "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %zmm2,0x40($output) |
| vmovdqu8 %zmm3,0x80($output) |
| vmovdqu8 %zmm4,0xc0($output) |
| add \$0x100,$output |
| sub \$0x100,$length |
| cmp \$0x100,$length |
| jge .L_main_loop_run_16_${rndsuffix} |
| |
| cmp \$0x80,$length |
| jge .L_main_loop_run_8_${rndsuffix} |
| jmp .L_do_n_blocks_${rndsuffix} |
| |
| .L_start_by8_${rndsuffix}: |
| # Make first 7 tweak values |
| vbroadcasti32x4 ($TW),%zmm0 |
| vbroadcasti32x4 shufb_15_7(%rip),%zmm8 |
| mov \$0xaa,$tmp1 |
| kmovq $tmp1,%k2 |
| |
| # Mult tweak by 2^{3, 2, 1, 0} |
| vpshufb %zmm8,%zmm0,%zmm1 |
| vpsllvq const_dq3210(%rip),%zmm0,%zmm4 |
| vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 |
vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3
| vpxorq %zmm2,%zmm4,%zmm4{%k2} |
| vpxord %zmm4,%zmm3,%zmm9 |
| |
| # Mult tweak by 2^{7, 6, 5, 4} |
| vpsllvq const_dq7654(%rip),%zmm0,%zmm5 |
| vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 |
vpclmulqdq \$0x0,$ZPOLY,%zmm6,%zmm7
| vpxorq %zmm6,%zmm5,%zmm5{%k2} |
| vpxord %zmm5,%zmm7,%zmm10 |
| |
| .L_main_loop_run_8_${rndsuffix}: |
| vmovdqu8 ($input),%zmm1 |
| vmovdqu8 0x40($input),%zmm2 |
| vmovdqu8 0x70($input),%xmm5 |
| add \$0x80,$input |
| ___ |
| } |
| decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu8 %zmm1,($output) |
| vmovdqu8 %zmm2,0x40($output) |
| add \$0x80,$output |
| sub \$0x80,$length |
| cmp \$0x80,$length |
| jge .L_main_loop_run_8_${rndsuffix} |
| jmp .L_do_n_blocks_${rndsuffix} |
| |
| .L_steal_cipher_${rndsuffix}: |
# Ciphertext stealing: %xmm8 holds the last full output block,
# %xmm0 the saved tweak for the final combined block.
| vmovdqa %xmm8,%xmm2 |
| |
# shift xmm8 left by (16 - $length) bytes, $length = tail byte count
| lea vpshufb_shf_table(%rip),$TEMPLOW |
| vmovdqu ($TEMPLOW,$length,1),%xmm10 |
| vpshufb %xmm10,%xmm8,%xmm8 |
| vmovdqu -0x10($input,$length,1),%xmm3 |
| vmovdqu %xmm8,-0x10($output,$length,1) |
| |
# shift xmm3 right by (16 - $length) bytes
| lea vpshufb_shf_table(%rip), $TEMPLOW |
| add \$16, $TEMPLOW |
| sub $length,$TEMPLOW |
| vmovdqu ($TEMPLOW),%xmm10 |
| vpxor mask1(%rip),%xmm10,%xmm10 |
| vpshufb %xmm10,%xmm3,%xmm3 |
| |
| vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 |
| |
| # xor Tweak value |
| vpxor %xmm0,%xmm3,%xmm8 |
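# %xmm8 is now the block to decrypt last: the $length bytes of the
# final partial ciphertext block merged with the tail carried over
# from the previous block's output, already xored with the saved
# tweak in %xmm0.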
| |
| # decrypt last block with cipher stealing |
| vpxor ($key1),%xmm8,%xmm8 |
| vaesdec 0x10($key1),%xmm8,%xmm8 |
| vaesdec 0x20($key1),%xmm8,%xmm8 |
| vaesdec 0x30($key1),%xmm8,%xmm8 |
| vaesdec 0x40($key1),%xmm8,%xmm8 |
| vaesdec 0x50($key1),%xmm8,%xmm8 |
| vaesdec 0x60($key1),%xmm8,%xmm8 |
| vaesdec 0x70($key1),%xmm8,%xmm8 |
| vaesdec 0x80($key1),%xmm8,%xmm8 |
| vaesdec 0x90($key1),%xmm8,%xmm8 |
| ___ |
| if ($is_128) { |
| $code .= "vaesdeclast 0xa0($key1),%xmm8,%xmm8\n"; |
| } else { |
| $code .= <<___; |
| vaesdec 0xa0($key1),%xmm8,%xmm8 |
| vaesdec 0xb0($key1),%xmm8,%xmm8 |
| vaesdec 0xc0($key1),%xmm8,%xmm8 |
| vaesdec 0xd0($key1),%xmm8,%xmm8 |
| vaesdeclast 0xe0($key1),%xmm8,%xmm8 |
| ___ |
| } |
$code .= <<___;
| # xor Tweak value |
| vpxor %xmm0,%xmm8,%xmm8 |
| |
| .L_done_${rndsuffix}: |
# store the final decrypted block
| vmovdqu %xmm8,-0x10($output) |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
| .L_ret_${rndsuffix}: |
| mov $GP_STORAGE($TW),%rbx |
| xor $tmp1,$tmp1 |
| mov $tmp1,$GP_STORAGE($TW) |
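# Scrub the spill slot so the saved value does not linger on the stack.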
| # Zero-out the whole of `%zmm0`. |
| vpxorq %zmm0,%zmm0,%zmm0 |
| ___ |
| } |
| |
| if ($win64) { |
| $code .= <<___; |
| mov $GP_STORAGE + 8*1($TW),%rdi |
| mov $tmp1,$GP_STORAGE + 8*1($TW) |
| mov $GP_STORAGE + 8*2($TW),%rsi |
| mov $tmp1,$GP_STORAGE + 8*2($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 |
| vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 |
| vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 |
| vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 |
| |
| # Zero the 64 bytes we just restored to the xmm registers. |
| vmovdqa64 %zmm0,$XMM_STORAGE($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 |
| vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 |
| vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 |
| vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 |
| |
| # And again. |
| vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) |
| |
| vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 |
| vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 |
| |
# The last chunk is only 32 bytes (256 bits), so we use `%ymm` as the
# source operand.
| vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) |
| ___ |
| } |
| |
| { |
| $code .= <<___; |
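# Restore the stack pointer saved in %rbp by the prologue, then return.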
| mov %rbp,$TW |
| pop %rbp |
| vzeroupper |
| ret |
| |
| .L_less_than_128_bytes_${rndsuffix}: |
| cmp \$0x10,$length |
| jb .L_ret_${rndsuffix} |
| |
| mov $length,$tmp1 |
| and \$0x70,$tmp1 |
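# $tmp1 = $length & 0x70 = 16 * (number of whole 16-byte blocks, 1..7)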
| cmp \$0x60,$tmp1 |
| je .L_num_blocks_is_6_${rndsuffix} |
| cmp \$0x50,$tmp1 |
| je .L_num_blocks_is_5_${rndsuffix} |
| cmp \$0x40,$tmp1 |
| je .L_num_blocks_is_4_${rndsuffix} |
| cmp \$0x30,$tmp1 |
| je .L_num_blocks_is_3_${rndsuffix} |
| cmp \$0x20,$tmp1 |
| je .L_num_blocks_is_2_${rndsuffix} |
| cmp \$0x10,$tmp1 |
| je .L_num_blocks_is_1_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n"; |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 7); |
| |
| { |
| $code .= <<___; |
| add \$0x70,$input |
| and \$0xf,$length |
| je .L_done_7_${rndsuffix} |
| |
| .L_steal_cipher_7_${rndsuffix}: |
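# Compute the next tweak as 2*T in GF(2^128). The 128-bit tweak lives
# in $TEMPLOW:$TEMPHIGH; a carry out of the top bit folds in the
# reduction constant from $gf_poly_8b (expected to hold 0x87, the low
# byte of the XTS polynomial x^128 + x^7 + x^2 + x + 1).
# The same update as a C sketch:
#   carry = hi >> 63;
#   hi    = (hi << 1) | (lo >> 63);
#   lo    = (lo << 1) ^ (carry ? 0x87 : 0);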
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm15,%xmm16 |
| vmovdqa 0x10($TW),%xmm15 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| vmovdqu %xmm5,0x40($output) |
| vmovdqu %xmm6,0x50($output) |
| add \$0x70,$output |
| vmovdqa64 %xmm16,%xmm0 |
| vmovdqa %xmm7,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_7_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| vmovdqu %xmm5,0x40($output) |
| vmovdqu %xmm6,0x50($output) |
| add \$0x70,$output |
| vmovdqa %xmm7,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n"; |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 6); |
| |
| { |
| $code .= <<___; |
| add \$0x60,$input |
| and \$0xf,$length |
| je .L_done_6_${rndsuffix} |
| |
| .L_steal_cipher_6_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm14,%xmm15 |
| vmovdqa 0x10($TW),%xmm14 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| vmovdqu %xmm5,0x40($output) |
| add \$0x60,$output |
| vmovdqa %xmm15,%xmm0 |
| vmovdqa %xmm6,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| $code .= "\n.L_done_6_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| vmovdqu %xmm5,0x40($output) |
| add \$0x60,$output |
| vmovdqa %xmm6,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n"; |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 5); |
| |
| { |
| $code .= <<___; |
| add \$0x50,$input |
| and \$0xf,$length |
| je .L_done_5_${rndsuffix} |
| |
| .L_steal_cipher_5_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm13,%xmm14 |
| vmovdqa 0x10($TW),%xmm13 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| add \$0x50,$output |
| vmovdqa %xmm14,%xmm0 |
| vmovdqa %xmm5,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_5_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| vmovdqu %xmm4,0x30($output) |
| add \$0x50,$output |
| vmovdqa %xmm5,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n"; |
| |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 4); |
| |
| { |
| $code .= <<___; |
| add \$0x40,$input |
| and \$0xf,$length |
| je .L_done_4_${rndsuffix} |
| |
| .L_steal_cipher_4_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm12,%xmm13 |
| vmovdqa 0x10($TW),%xmm12 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| add \$0x40,$output |
| vmovdqa %xmm13,%xmm0 |
| vmovdqa %xmm4,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_4_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| vmovdqu %xmm3,0x20($output) |
| add \$0x40,$output |
| vmovdqa %xmm4,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n"; |
| |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 3); |
| |
| { |
| $code .= <<___; |
| add \$0x30,$input |
| and \$0xf,$length |
| je .L_done_3_${rndsuffix} |
| |
| .L_steal_cipher_3_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm11,%xmm12 |
| vmovdqa 0x10($TW),%xmm11 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| add \$0x30,$output |
| vmovdqa %xmm12,%xmm0 |
| vmovdqa %xmm3,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| $code .= "\n.L_done_3_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| vmovdqu %xmm2,0x10($output) |
| add \$0x30,$output |
| vmovdqa %xmm3,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n"; |
| |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 2); |
| |
| { |
| $code .= <<___; |
| add \$0x20,$input |
| and \$0xf,$length |
| je .L_done_2_${rndsuffix} |
| |
| .L_steal_cipher_2_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm10,%xmm11 |
| vmovdqa 0x10($TW),%xmm10 |
| ___ |
| } |
| |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| add \$0x20,$output |
| vmovdqa %xmm11,%xmm0 |
| vmovdqa %xmm2,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_done_2_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| vmovdqu %xmm1,($output) |
| add \$0x20,$output |
| vmovdqa %xmm2,%xmm8 |
| jmp .L_done_${rndsuffix} |
| ___ |
| } |
| |
| $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n"; |
| |
| initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", 1); |
| |
| { |
| $code .= <<___; |
| add \$0x10,$input |
| and \$0xf,$length |
| je .L_done_1_${rndsuffix} |
| |
| .L_steal_cipher_1_${rndsuffix}: |
| xor $gf_poly_8b_temp, $gf_poly_8b_temp |
| shl \$1, $TEMPLOW |
| adc $TEMPHIGH, $TEMPHIGH |
| cmovc $gf_poly_8b, $gf_poly_8b_temp |
| xor $gf_poly_8b_temp, $TEMPLOW |
| mov $TEMPLOW,0x10($TW) |
| mov $TEMPHIGH,0x18($TW) |
| vmovdqa64 %xmm9,%xmm10 |
| vmovdqa 0x10($TW),%xmm9 |
| ___ |
| } |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| add \$0x10,$output |
| vmovdqa %xmm10,%xmm0 |
| vmovdqa %xmm1,%xmm8 |
| jmp .L_steal_cipher_${rndsuffix} |
| ___ |
| } |
| $code .= "\n.L_done_1_${rndsuffix}:\n"; |
| decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", |
| "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", |
| "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); |
| |
| { |
| $code .= <<___; |
| add \$0x10,$output |
| vmovdqa %xmm1,%xmm8 |
| jmp .L_done_${rndsuffix} |
| .cfi_endproc |
| ___ |
| } |
| |
| } |
| |
# The only difference between AES-XTS-128 and AES-XTS-256 is the number
# of AES rounds (10 vs 14), so both flavours are generated from the same
# perlasm base; `$is_128' selects between them, with 0 giving the
# 14-round AES-256 code.
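# enc(1)/dec(1) emit the AES-128 flavour; enc(0)/dec(0) emit AES-256.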
| |
| enc(1); |
| dec(1); |
| |
| enc(0); |
| dec(0); |
| |
| $code .= <<___; |
| .section .rodata |
| .align 16 |
| |
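# Sliding window of vpshufb indices. A 16-byte load at offset N
# (1..15) gives a pattern that shifts a register left by 16-N bytes,
# the high-bit entries zeroing the vacated positions. Loading at
# offset 16-N and xoring with mask1 instead yields the complementary
# right-shift-by-(16-N) pattern, whose high bits also steer the
# vpblendvb merge in .L_steal_cipher.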
| vpshufb_shf_table: |
| .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988 |
| .quad 0x0706050403020100, 0x000e0d0c0b0a0908 |
| |
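# 0x80 in every byte: xored into a shuffle pattern to toggle vpshufb's
# "zero this byte" bit when building the complementary mask.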
| mask1: |
| .quad 0x8080808080808080, 0x8080808080808080 |
| |
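# Per-qword variable shift counts for vpsllvq/vpsrlvq when multiplying
# the four tweak lanes by 2^{3,2,1,0} and 2^{7,6,5,4}.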
| const_dq3210: |
| .quad 0, 0, 1, 1, 2, 2, 3, 3 |
| const_dq5678: |
| .quad 8, 8, 7, 7, 6, 6, 5, 5 |
| const_dq7654: |
| .quad 4, 4, 5, 5, 6, 6, 7, 7 |
| const_dq1234: |
| .quad 4, 4, 3, 3, 2, 2, 1, 1 |
| |
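# vpshufb pattern that isolates byte 15 and byte 7 of each 128-bit
# lane, exposing the bits that the variable shifts carry across qword
# boundaries.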
| shufb_15_7: |
| .byte 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff |
| |
| .text |
| ___ |
| |
| } else { |
| $code .= <<___; |
| .text |
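# The assembler cannot emit VAES/VPCLMULQDQ, so install stubs that
# raise an invalid-opcode exception (ud2) if ever called, plus an
# eligibility probe that always returns zero so callers never
# dispatch here.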
| .globl aesni_xts_128_encrypt_avx512 |
| .globl aesni_xts_128_decrypt_avx512 |
| |
| aesni_xts_128_encrypt_avx512: |
| aesni_xts_128_decrypt_avx512: |
| .byte 0x0f,0x0b # ud2 |
| ret |
| |
| .globl aesni_xts_256_encrypt_avx512 |
| .globl aesni_xts_256_decrypt_avx512 |
| |
| aesni_xts_256_encrypt_avx512: |
| aesni_xts_256_decrypt_avx512: |
| .byte 0x0f,0x0b # ud2 |
| ret |
| |
| .globl aesni_xts_avx512_eligible |
| .type aesni_xts_avx512_eligible,\@abi-omnipotent |
| aesni_xts_avx512_eligible: |
| xor %eax,%eax |
| ret |
| .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible |
| |
| ___ |
| } |
| |
| print $code; |
| |
| close STDOUT or die "error closing STDOUT: $!"; |