| #!/usr/bin/env perl |
| # Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| use strict; |
| |
| my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; |
| my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; |
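# A typical invocation (the flavour and output names here are only
# illustrative) is something like
#
#   perl bsaes-armv8.pl linux64 bsaes-armv8.S
#
# where the flavour is forwarded to arm-xlate.pl and the trailing
# argument names the output file.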
| my $xlate; |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; |
| ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or |
| die "can't locate arm-xlate.pl"; |
| |
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
| *STDOUT=*OUT; |
| |
| my $code = data(); |
| print $code; |
| |
| close STDOUT or die "error closing STDOUT: $!"; # enforce flush |
| |
| sub data |
| { |
| local $/; |
| return <DATA>; |
| } |
| |
| __END__ |
| // Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. |
| // |
// Licensed under the Apache License 2.0 (the "License"). You may not use
| // this file except in compliance with the License. You can obtain a copy |
| // in the file LICENSE in the source distribution or at |
| // https://www.openssl.org/source/license.html |
| // |
| // ==================================================================== |
| // Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL |
| // project. Rights for redistribution and usage in source and binary |
| // forms are granted according to the OpenSSL license. |
| // ==================================================================== |
| // |
| // This implementation is a translation of bsaes-armv7 for AArch64. |
| // No attempt has been made to carry across the build switches for |
| // kernel targets, since the Linux kernel crypto support has moved on |
| // from when it was based on OpenSSL. |
| |
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; there is little to be gained by wrapping it up
// in Perl, so it is presented as pure assembly.
| |
| |
| #include "crypto/arm_arch.h" |
| |
| .text |
| |
| .extern AES_cbc_encrypt |
| .extern AES_encrypt |
| .extern AES_decrypt |
| |
| .type _bsaes_decrypt8,%function |
| .align 4 |
| // On entry: |
| // x9 -> key (previously expanded using _bsaes_key_convert) |
| // x10 = number of rounds |
| // v0-v7 input data |
| // On exit: |
| // x9-x11 corrupted |
| // other general-purpose registers preserved |
| // v0-v7 output data |
| // v11-v15 preserved |
| // other SIMD registers corrupted |
| _bsaes_decrypt8: |
| ldr q8, [x9], #16 |
| adr x11, .LM0ISR |
| movi v9.16b, #0x55 |
| ldr q10, [x11], #16 |
| movi v16.16b, #0x33 |
| movi v17.16b, #0x0f |
| sub x10, x10, #1 |
| eor v0.16b, v0.16b, v8.16b |
| eor v1.16b, v1.16b, v8.16b |
| eor v2.16b, v2.16b, v8.16b |
| eor v4.16b, v4.16b, v8.16b |
| eor v3.16b, v3.16b, v8.16b |
| eor v5.16b, v5.16b, v8.16b |
| tbl v0.16b, {v0.16b}, v10.16b |
| tbl v1.16b, {v1.16b}, v10.16b |
| tbl v2.16b, {v2.16b}, v10.16b |
| tbl v4.16b, {v4.16b}, v10.16b |
| eor v6.16b, v6.16b, v8.16b |
| eor v7.16b, v7.16b, v8.16b |
| tbl v3.16b, {v3.16b}, v10.16b |
| tbl v5.16b, {v5.16b}, v10.16b |
| tbl v6.16b, {v6.16b}, v10.16b |
| ushr v8.2d, v0.2d, #1 |
| tbl v7.16b, {v7.16b}, v10.16b |
| ushr v10.2d, v4.2d, #1 |
| ushr v18.2d, v2.2d, #1 |
| eor v8.16b, v8.16b, v1.16b |
| ushr v19.2d, v6.2d, #1 |
| eor v10.16b, v10.16b, v5.16b |
| eor v18.16b, v18.16b, v3.16b |
| and v8.16b, v8.16b, v9.16b |
| eor v19.16b, v19.16b, v7.16b |
| and v10.16b, v10.16b, v9.16b |
| and v18.16b, v18.16b, v9.16b |
| eor v1.16b, v1.16b, v8.16b |
| shl v8.2d, v8.2d, #1 |
| and v9.16b, v19.16b, v9.16b |
| eor v5.16b, v5.16b, v10.16b |
| shl v10.2d, v10.2d, #1 |
| eor v3.16b, v3.16b, v18.16b |
| shl v18.2d, v18.2d, #1 |
| eor v0.16b, v0.16b, v8.16b |
| shl v8.2d, v9.2d, #1 |
| eor v7.16b, v7.16b, v9.16b |
| eor v4.16b, v4.16b, v10.16b |
| eor v2.16b, v2.16b, v18.16b |
| ushr v9.2d, v1.2d, #2 |
| eor v6.16b, v6.16b, v8.16b |
| ushr v8.2d, v0.2d, #2 |
| ushr v10.2d, v5.2d, #2 |
| ushr v18.2d, v4.2d, #2 |
| eor v9.16b, v9.16b, v3.16b |
| eor v8.16b, v8.16b, v2.16b |
| eor v10.16b, v10.16b, v7.16b |
| eor v18.16b, v18.16b, v6.16b |
| and v9.16b, v9.16b, v16.16b |
| and v8.16b, v8.16b, v16.16b |
| and v10.16b, v10.16b, v16.16b |
| and v16.16b, v18.16b, v16.16b |
| eor v3.16b, v3.16b, v9.16b |
| shl v9.2d, v9.2d, #2 |
| eor v2.16b, v2.16b, v8.16b |
| shl v8.2d, v8.2d, #2 |
| eor v7.16b, v7.16b, v10.16b |
| shl v10.2d, v10.2d, #2 |
| eor v6.16b, v6.16b, v16.16b |
| shl v16.2d, v16.2d, #2 |
| eor v1.16b, v1.16b, v9.16b |
| eor v0.16b, v0.16b, v8.16b |
| eor v5.16b, v5.16b, v10.16b |
| eor v4.16b, v4.16b, v16.16b |
| ushr v8.2d, v3.2d, #4 |
| ushr v9.2d, v2.2d, #4 |
| ushr v10.2d, v1.2d, #4 |
| ushr v16.2d, v0.2d, #4 |
| eor v8.16b, v8.16b, v7.16b |
| eor v9.16b, v9.16b, v6.16b |
| eor v10.16b, v10.16b, v5.16b |
| eor v16.16b, v16.16b, v4.16b |
| and v8.16b, v8.16b, v17.16b |
| and v9.16b, v9.16b, v17.16b |
| and v10.16b, v10.16b, v17.16b |
| and v16.16b, v16.16b, v17.16b |
| eor v7.16b, v7.16b, v8.16b |
| shl v8.2d, v8.2d, #4 |
| eor v6.16b, v6.16b, v9.16b |
| shl v9.2d, v9.2d, #4 |
| eor v5.16b, v5.16b, v10.16b |
| shl v10.2d, v10.2d, #4 |
| eor v4.16b, v4.16b, v16.16b |
| shl v16.2d, v16.2d, #4 |
| eor v3.16b, v3.16b, v8.16b |
| eor v2.16b, v2.16b, v9.16b |
| eor v1.16b, v1.16b, v10.16b |
| eor v0.16b, v0.16b, v16.16b |
| b .Ldec_sbox |
| .align 4 |
| .Ldec_loop: |
| ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 |
| ldp q8, q9, [x9], #32 |
| eor v0.16b, v16.16b, v0.16b |
| ldr q10, [x9], #16 |
| eor v1.16b, v17.16b, v1.16b |
| ldr q16, [x9], #16 |
| eor v2.16b, v18.16b, v2.16b |
| eor v3.16b, v19.16b, v3.16b |
| eor v4.16b, v8.16b, v4.16b |
| eor v5.16b, v9.16b, v5.16b |
| eor v6.16b, v10.16b, v6.16b |
| eor v7.16b, v16.16b, v7.16b |
| tbl v0.16b, {v0.16b}, v28.16b |
| tbl v1.16b, {v1.16b}, v28.16b |
| tbl v2.16b, {v2.16b}, v28.16b |
| tbl v3.16b, {v3.16b}, v28.16b |
| tbl v4.16b, {v4.16b}, v28.16b |
| tbl v5.16b, {v5.16b}, v28.16b |
| tbl v6.16b, {v6.16b}, v28.16b |
| tbl v7.16b, {v7.16b}, v28.16b |
| .Ldec_sbox: |
| eor v1.16b, v1.16b, v4.16b |
| eor v3.16b, v3.16b, v4.16b |
| subs x10, x10, #1 |
| eor v4.16b, v4.16b, v7.16b |
| eor v2.16b, v2.16b, v7.16b |
| eor v1.16b, v1.16b, v6.16b |
| eor v6.16b, v6.16b, v4.16b |
| eor v2.16b, v2.16b, v5.16b |
| eor v0.16b, v0.16b, v1.16b |
| eor v7.16b, v7.16b, v6.16b |
| eor v8.16b, v6.16b, v2.16b |
| and v9.16b, v4.16b, v6.16b |
| eor v10.16b, v2.16b, v6.16b |
| eor v3.16b, v3.16b, v0.16b |
| eor v5.16b, v5.16b, v0.16b |
| eor v16.16b, v7.16b, v4.16b |
| eor v17.16b, v4.16b, v0.16b |
| and v18.16b, v0.16b, v2.16b |
| eor v19.16b, v7.16b, v4.16b |
| eor v1.16b, v1.16b, v3.16b |
| eor v20.16b, v3.16b, v0.16b |
| eor v21.16b, v5.16b, v2.16b |
| eor v22.16b, v3.16b, v7.16b |
| and v8.16b, v17.16b, v8.16b |
| orr v17.16b, v3.16b, v5.16b |
| eor v23.16b, v1.16b, v6.16b |
| eor v24.16b, v20.16b, v16.16b |
| eor v25.16b, v1.16b, v5.16b |
| orr v26.16b, v20.16b, v21.16b |
| and v20.16b, v20.16b, v21.16b |
| and v27.16b, v7.16b, v1.16b |
| eor v21.16b, v21.16b, v23.16b |
| orr v28.16b, v16.16b, v23.16b |
| orr v29.16b, v22.16b, v25.16b |
| eor v26.16b, v26.16b, v8.16b |
| and v16.16b, v16.16b, v23.16b |
| and v22.16b, v22.16b, v25.16b |
| and v21.16b, v24.16b, v21.16b |
| eor v8.16b, v28.16b, v8.16b |
| eor v23.16b, v5.16b, v2.16b |
| eor v24.16b, v1.16b, v6.16b |
| eor v16.16b, v16.16b, v22.16b |
| eor v22.16b, v3.16b, v0.16b |
| eor v25.16b, v29.16b, v21.16b |
| eor v21.16b, v26.16b, v21.16b |
| eor v8.16b, v8.16b, v20.16b |
| eor v26.16b, v23.16b, v24.16b |
| eor v16.16b, v16.16b, v20.16b |
| eor v28.16b, v22.16b, v19.16b |
| eor v20.16b, v25.16b, v20.16b |
| eor v9.16b, v21.16b, v9.16b |
| eor v8.16b, v8.16b, v18.16b |
| eor v18.16b, v5.16b, v1.16b |
| eor v21.16b, v16.16b, v17.16b |
| eor v16.16b, v16.16b, v17.16b |
| eor v17.16b, v20.16b, v27.16b |
| eor v20.16b, v3.16b, v7.16b |
| eor v25.16b, v9.16b, v8.16b |
| eor v27.16b, v0.16b, v4.16b |
| and v29.16b, v9.16b, v17.16b |
| eor v30.16b, v8.16b, v29.16b |
| eor v31.16b, v21.16b, v29.16b |
| eor v29.16b, v21.16b, v29.16b |
| bsl v30.16b, v17.16b, v21.16b |
| bsl v31.16b, v9.16b, v8.16b |
| bsl v16.16b, v30.16b, v29.16b |
| bsl v21.16b, v29.16b, v30.16b |
| eor v8.16b, v31.16b, v30.16b |
| and v1.16b, v1.16b, v31.16b |
| and v9.16b, v16.16b, v31.16b |
| and v6.16b, v6.16b, v30.16b |
| eor v16.16b, v17.16b, v21.16b |
| and v4.16b, v4.16b, v30.16b |
| eor v17.16b, v8.16b, v30.16b |
| and v21.16b, v24.16b, v8.16b |
| eor v9.16b, v9.16b, v25.16b |
| and v19.16b, v19.16b, v8.16b |
| eor v24.16b, v30.16b, v16.16b |
| eor v25.16b, v30.16b, v16.16b |
| and v7.16b, v7.16b, v17.16b |
| and v10.16b, v10.16b, v16.16b |
| eor v29.16b, v9.16b, v16.16b |
| eor v30.16b, v31.16b, v9.16b |
| and v0.16b, v24.16b, v0.16b |
| and v9.16b, v18.16b, v9.16b |
| and v2.16b, v25.16b, v2.16b |
| eor v10.16b, v10.16b, v6.16b |
| eor v18.16b, v29.16b, v16.16b |
| and v5.16b, v30.16b, v5.16b |
| eor v24.16b, v8.16b, v29.16b |
| and v25.16b, v26.16b, v29.16b |
| and v26.16b, v28.16b, v29.16b |
| eor v8.16b, v8.16b, v29.16b |
| eor v17.16b, v17.16b, v18.16b |
| eor v5.16b, v1.16b, v5.16b |
| and v23.16b, v24.16b, v23.16b |
| eor v21.16b, v21.16b, v25.16b |
| eor v19.16b, v19.16b, v26.16b |
| eor v0.16b, v4.16b, v0.16b |
| and v3.16b, v17.16b, v3.16b |
| eor v1.16b, v9.16b, v1.16b |
| eor v9.16b, v25.16b, v23.16b |
| eor v5.16b, v5.16b, v21.16b |
| eor v2.16b, v6.16b, v2.16b |
| and v6.16b, v8.16b, v22.16b |
| eor v3.16b, v7.16b, v3.16b |
| and v8.16b, v20.16b, v18.16b |
| eor v10.16b, v10.16b, v9.16b |
| eor v0.16b, v0.16b, v19.16b |
| eor v9.16b, v1.16b, v9.16b |
| eor v1.16b, v2.16b, v21.16b |
| eor v3.16b, v3.16b, v19.16b |
| and v16.16b, v27.16b, v16.16b |
| eor v17.16b, v26.16b, v6.16b |
| eor v6.16b, v8.16b, v7.16b |
| eor v7.16b, v1.16b, v9.16b |
| eor v1.16b, v5.16b, v3.16b |
| eor v2.16b, v10.16b, v3.16b |
| eor v4.16b, v16.16b, v4.16b |
| eor v8.16b, v6.16b, v17.16b |
| eor v5.16b, v9.16b, v3.16b |
| eor v9.16b, v0.16b, v1.16b |
| eor v6.16b, v7.16b, v1.16b |
| eor v0.16b, v4.16b, v17.16b |
| eor v4.16b, v8.16b, v7.16b |
| eor v7.16b, v9.16b, v2.16b |
| eor v8.16b, v3.16b, v0.16b |
| eor v7.16b, v7.16b, v5.16b |
| eor v3.16b, v4.16b, v7.16b |
| eor v4.16b, v7.16b, v0.16b |
| eor v7.16b, v8.16b, v3.16b |
| bcc .Ldec_done |
| ext v8.16b, v0.16b, v0.16b, #8 |
| ext v9.16b, v1.16b, v1.16b, #8 |
| ldr q28, [x11] // load from .LISR in common case (x10 > 0) |
| ext v10.16b, v6.16b, v6.16b, #8 |
| ext v16.16b, v3.16b, v3.16b, #8 |
| ext v17.16b, v5.16b, v5.16b, #8 |
| ext v18.16b, v4.16b, v4.16b, #8 |
| eor v8.16b, v8.16b, v0.16b |
| eor v9.16b, v9.16b, v1.16b |
| eor v10.16b, v10.16b, v6.16b |
| eor v16.16b, v16.16b, v3.16b |
| eor v17.16b, v17.16b, v5.16b |
| ext v19.16b, v2.16b, v2.16b, #8 |
| ext v20.16b, v7.16b, v7.16b, #8 |
| eor v18.16b, v18.16b, v4.16b |
| eor v6.16b, v6.16b, v8.16b |
| eor v8.16b, v2.16b, v10.16b |
| eor v4.16b, v4.16b, v9.16b |
| eor v2.16b, v19.16b, v2.16b |
| eor v9.16b, v20.16b, v7.16b |
| eor v0.16b, v0.16b, v16.16b |
| eor v1.16b, v1.16b, v16.16b |
| eor v6.16b, v6.16b, v17.16b |
| eor v8.16b, v8.16b, v16.16b |
| eor v7.16b, v7.16b, v18.16b |
| eor v4.16b, v4.16b, v16.16b |
| eor v2.16b, v3.16b, v2.16b |
| eor v1.16b, v1.16b, v17.16b |
| eor v3.16b, v5.16b, v9.16b |
| eor v5.16b, v8.16b, v17.16b |
| eor v7.16b, v7.16b, v17.16b |
| ext v8.16b, v0.16b, v0.16b, #12 |
| ext v9.16b, v6.16b, v6.16b, #12 |
| ext v10.16b, v4.16b, v4.16b, #12 |
| ext v16.16b, v1.16b, v1.16b, #12 |
| ext v17.16b, v5.16b, v5.16b, #12 |
| ext v18.16b, v7.16b, v7.16b, #12 |
| eor v0.16b, v0.16b, v8.16b |
| eor v6.16b, v6.16b, v9.16b |
| eor v4.16b, v4.16b, v10.16b |
| ext v19.16b, v2.16b, v2.16b, #12 |
| ext v20.16b, v3.16b, v3.16b, #12 |
| eor v1.16b, v1.16b, v16.16b |
| eor v5.16b, v5.16b, v17.16b |
| eor v7.16b, v7.16b, v18.16b |
| eor v2.16b, v2.16b, v19.16b |
| eor v16.16b, v16.16b, v0.16b |
| eor v3.16b, v3.16b, v20.16b |
| eor v17.16b, v17.16b, v4.16b |
| eor v10.16b, v10.16b, v6.16b |
| ext v0.16b, v0.16b, v0.16b, #8 |
| eor v9.16b, v9.16b, v1.16b |
| ext v1.16b, v1.16b, v1.16b, #8 |
| eor v8.16b, v8.16b, v3.16b |
| eor v16.16b, v16.16b, v3.16b |
| eor v18.16b, v18.16b, v5.16b |
| eor v19.16b, v19.16b, v7.16b |
| ext v21.16b, v5.16b, v5.16b, #8 |
| ext v5.16b, v7.16b, v7.16b, #8 |
| eor v7.16b, v20.16b, v2.16b |
| ext v4.16b, v4.16b, v4.16b, #8 |
| ext v20.16b, v3.16b, v3.16b, #8 |
| eor v17.16b, v17.16b, v3.16b |
| ext v2.16b, v2.16b, v2.16b, #8 |
| eor v3.16b, v10.16b, v3.16b |
| ext v10.16b, v6.16b, v6.16b, #8 |
| eor v0.16b, v0.16b, v8.16b |
| eor v1.16b, v1.16b, v16.16b |
| eor v5.16b, v5.16b, v18.16b |
| eor v3.16b, v3.16b, v4.16b |
| eor v7.16b, v20.16b, v7.16b |
| eor v6.16b, v2.16b, v19.16b |
| eor v4.16b, v21.16b, v17.16b |
| eor v2.16b, v10.16b, v9.16b |
| bne .Ldec_loop |
| ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) |
| b .Ldec_loop |
| .align 4 |
| .Ldec_done: |
| ushr v8.2d, v0.2d, #1 |
| movi v9.16b, #0x55 |
| ldr q10, [x9] |
| ushr v16.2d, v2.2d, #1 |
| movi v17.16b, #0x33 |
| ushr v18.2d, v6.2d, #1 |
| movi v19.16b, #0x0f |
| eor v8.16b, v8.16b, v1.16b |
| ushr v20.2d, v3.2d, #1 |
| eor v16.16b, v16.16b, v7.16b |
| eor v18.16b, v18.16b, v4.16b |
| and v8.16b, v8.16b, v9.16b |
| eor v20.16b, v20.16b, v5.16b |
| and v16.16b, v16.16b, v9.16b |
| and v18.16b, v18.16b, v9.16b |
| shl v21.2d, v8.2d, #1 |
| eor v1.16b, v1.16b, v8.16b |
| and v8.16b, v20.16b, v9.16b |
| eor v7.16b, v7.16b, v16.16b |
| shl v9.2d, v16.2d, #1 |
| eor v4.16b, v4.16b, v18.16b |
| shl v16.2d, v18.2d, #1 |
| eor v0.16b, v0.16b, v21.16b |
| shl v18.2d, v8.2d, #1 |
| eor v5.16b, v5.16b, v8.16b |
| eor v2.16b, v2.16b, v9.16b |
| eor v6.16b, v6.16b, v16.16b |
| ushr v8.2d, v1.2d, #2 |
| eor v3.16b, v3.16b, v18.16b |
| ushr v9.2d, v0.2d, #2 |
| ushr v16.2d, v7.2d, #2 |
| ushr v18.2d, v2.2d, #2 |
| eor v8.16b, v8.16b, v4.16b |
| eor v9.16b, v9.16b, v6.16b |
| eor v16.16b, v16.16b, v5.16b |
| eor v18.16b, v18.16b, v3.16b |
| and v8.16b, v8.16b, v17.16b |
| and v9.16b, v9.16b, v17.16b |
| and v16.16b, v16.16b, v17.16b |
| and v17.16b, v18.16b, v17.16b |
| eor v4.16b, v4.16b, v8.16b |
| shl v8.2d, v8.2d, #2 |
| eor v6.16b, v6.16b, v9.16b |
| shl v9.2d, v9.2d, #2 |
| eor v5.16b, v5.16b, v16.16b |
| shl v16.2d, v16.2d, #2 |
| eor v3.16b, v3.16b, v17.16b |
| shl v17.2d, v17.2d, #2 |
| eor v1.16b, v1.16b, v8.16b |
| eor v0.16b, v0.16b, v9.16b |
| eor v7.16b, v7.16b, v16.16b |
| eor v2.16b, v2.16b, v17.16b |
| ushr v8.2d, v4.2d, #4 |
| ushr v9.2d, v6.2d, #4 |
| ushr v16.2d, v1.2d, #4 |
| ushr v17.2d, v0.2d, #4 |
| eor v8.16b, v8.16b, v5.16b |
| eor v9.16b, v9.16b, v3.16b |
| eor v16.16b, v16.16b, v7.16b |
| eor v17.16b, v17.16b, v2.16b |
| and v8.16b, v8.16b, v19.16b |
| and v9.16b, v9.16b, v19.16b |
| and v16.16b, v16.16b, v19.16b |
| and v17.16b, v17.16b, v19.16b |
| eor v5.16b, v5.16b, v8.16b |
| shl v8.2d, v8.2d, #4 |
| eor v3.16b, v3.16b, v9.16b |
| shl v9.2d, v9.2d, #4 |
| eor v7.16b, v7.16b, v16.16b |
| shl v16.2d, v16.2d, #4 |
| eor v2.16b, v2.16b, v17.16b |
| shl v17.2d, v17.2d, #4 |
| eor v4.16b, v4.16b, v8.16b |
| eor v6.16b, v6.16b, v9.16b |
| eor v7.16b, v7.16b, v10.16b |
| eor v1.16b, v1.16b, v16.16b |
| eor v2.16b, v2.16b, v10.16b |
| eor v0.16b, v0.16b, v17.16b |
| eor v4.16b, v4.16b, v10.16b |
| eor v6.16b, v6.16b, v10.16b |
| eor v3.16b, v3.16b, v10.16b |
| eor v5.16b, v5.16b, v10.16b |
| eor v1.16b, v1.16b, v10.16b |
| eor v0.16b, v0.16b, v10.16b |
| ret |
| .size _bsaes_decrypt8,.-_bsaes_decrypt8 |
| |
| .type _bsaes_const,%object |
| .align 6 |
| _bsaes_const: |
| // InvShiftRows constants |
| // Used in _bsaes_decrypt8, which assumes contiguity |
| // .LM0ISR used with round 0 key |
| // .LISR used with middle round keys |
| // .LISRM0 used with final round key |
| .LM0ISR: |
| .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 |
| .LISR: |
| .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 |
| .LISRM0: |
| .quad 0x01040b0e0205080f, 0x0306090c00070a0d |
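
// For reference, applying one of these masks with `tbl` performs a byte
// permutation that in C terms is roughly (an illustrative sketch, not
// part of the build):
//
//   // dst, src and mask are 16-byte arrays; mask bytes are indices 0-15
//   for (int j = 0; j < 16; j++)
//       dst[j] = src[mask[j]];
//
// so each pair of .quad values spells out, byte by byte, where every
// output byte of the permutation is sourced from.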
| |
| // ShiftRows constants |
| // Used in _bsaes_encrypt8, which assumes contiguity |
| // .LM0SR used with round 0 key |
| // .LSR used with middle round keys |
| // .LSRM0 used with final round key |
| .LM0SR: |
| .quad 0x0a0e02060f03070b, 0x0004080c05090d01 |
| .LSR: |
| .quad 0x0504070600030201, 0x0f0e0d0c0a09080b |
| .LSRM0: |
| .quad 0x0304090e00050a0f, 0x01060b0c0207080d |
| |
| .LM0_bigendian: |
| .quad 0x02060a0e03070b0f, 0x0004080c0105090d |
| .LM0_littleendian: |
| .quad 0x0105090d0004080c, 0x03070b0f02060a0e |
| |
| // Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into |
| // _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR |
| .LREVM0SR: |
| .quad 0x090d01050c000408, 0x03070b0f060a0e02 |
| |
| .align 6 |
| .size _bsaes_const,.-_bsaes_const |
| |
| .type _bsaes_encrypt8,%function |
| .align 4 |
| // On entry: |
| // x9 -> key (previously expanded using _bsaes_key_convert) |
| // x10 = number of rounds |
| // v0-v7 input data |
| // On exit: |
| // x9-x11 corrupted |
| // other general-purpose registers preserved |
| // v0-v7 output data |
| // v11-v15 preserved |
| // other SIMD registers corrupted |
| _bsaes_encrypt8: |
| ldr q8, [x9], #16 |
| adr x11, .LM0SR |
| ldr q9, [x11], #16 |
| _bsaes_encrypt8_alt: |
| eor v0.16b, v0.16b, v8.16b |
| eor v1.16b, v1.16b, v8.16b |
| sub x10, x10, #1 |
| eor v2.16b, v2.16b, v8.16b |
| eor v4.16b, v4.16b, v8.16b |
| eor v3.16b, v3.16b, v8.16b |
| eor v5.16b, v5.16b, v8.16b |
| tbl v0.16b, {v0.16b}, v9.16b |
| tbl v1.16b, {v1.16b}, v9.16b |
| tbl v2.16b, {v2.16b}, v9.16b |
| tbl v4.16b, {v4.16b}, v9.16b |
| eor v6.16b, v6.16b, v8.16b |
| eor v7.16b, v7.16b, v8.16b |
| tbl v3.16b, {v3.16b}, v9.16b |
| tbl v5.16b, {v5.16b}, v9.16b |
| tbl v6.16b, {v6.16b}, v9.16b |
| ushr v8.2d, v0.2d, #1 |
| movi v10.16b, #0x55 |
| tbl v7.16b, {v7.16b}, v9.16b |
| ushr v9.2d, v4.2d, #1 |
| movi v16.16b, #0x33 |
| ushr v17.2d, v2.2d, #1 |
| eor v8.16b, v8.16b, v1.16b |
| movi v18.16b, #0x0f |
| ushr v19.2d, v6.2d, #1 |
| eor v9.16b, v9.16b, v5.16b |
| eor v17.16b, v17.16b, v3.16b |
| and v8.16b, v8.16b, v10.16b |
| eor v19.16b, v19.16b, v7.16b |
| and v9.16b, v9.16b, v10.16b |
| and v17.16b, v17.16b, v10.16b |
| eor v1.16b, v1.16b, v8.16b |
| shl v8.2d, v8.2d, #1 |
| and v10.16b, v19.16b, v10.16b |
| eor v5.16b, v5.16b, v9.16b |
| shl v9.2d, v9.2d, #1 |
| eor v3.16b, v3.16b, v17.16b |
| shl v17.2d, v17.2d, #1 |
| eor v0.16b, v0.16b, v8.16b |
| shl v8.2d, v10.2d, #1 |
| eor v7.16b, v7.16b, v10.16b |
| eor v4.16b, v4.16b, v9.16b |
| eor v2.16b, v2.16b, v17.16b |
| ushr v9.2d, v1.2d, #2 |
| eor v6.16b, v6.16b, v8.16b |
| ushr v8.2d, v0.2d, #2 |
| ushr v10.2d, v5.2d, #2 |
| ushr v17.2d, v4.2d, #2 |
| eor v9.16b, v9.16b, v3.16b |
| eor v8.16b, v8.16b, v2.16b |
| eor v10.16b, v10.16b, v7.16b |
| eor v17.16b, v17.16b, v6.16b |
| and v9.16b, v9.16b, v16.16b |
| and v8.16b, v8.16b, v16.16b |
| and v10.16b, v10.16b, v16.16b |
| and v16.16b, v17.16b, v16.16b |
| eor v3.16b, v3.16b, v9.16b |
| shl v9.2d, v9.2d, #2 |
| eor v2.16b, v2.16b, v8.16b |
| shl v8.2d, v8.2d, #2 |
| eor v7.16b, v7.16b, v10.16b |
| shl v10.2d, v10.2d, #2 |
| eor v6.16b, v6.16b, v16.16b |
| shl v16.2d, v16.2d, #2 |
| eor v1.16b, v1.16b, v9.16b |
| eor v0.16b, v0.16b, v8.16b |
| eor v5.16b, v5.16b, v10.16b |
| eor v4.16b, v4.16b, v16.16b |
| ushr v8.2d, v3.2d, #4 |
| ushr v9.2d, v2.2d, #4 |
| ushr v10.2d, v1.2d, #4 |
| ushr v16.2d, v0.2d, #4 |
| eor v8.16b, v8.16b, v7.16b |
| eor v9.16b, v9.16b, v6.16b |
| eor v10.16b, v10.16b, v5.16b |
| eor v16.16b, v16.16b, v4.16b |
| and v8.16b, v8.16b, v18.16b |
| and v9.16b, v9.16b, v18.16b |
| and v10.16b, v10.16b, v18.16b |
| and v16.16b, v16.16b, v18.16b |
| eor v7.16b, v7.16b, v8.16b |
| shl v8.2d, v8.2d, #4 |
| eor v6.16b, v6.16b, v9.16b |
| shl v9.2d, v9.2d, #4 |
| eor v5.16b, v5.16b, v10.16b |
| shl v10.2d, v10.2d, #4 |
| eor v4.16b, v4.16b, v16.16b |
| shl v16.2d, v16.2d, #4 |
| eor v3.16b, v3.16b, v8.16b |
| eor v2.16b, v2.16b, v9.16b |
| eor v1.16b, v1.16b, v10.16b |
| eor v0.16b, v0.16b, v16.16b |
| b .Lenc_sbox |
| .align 4 |
| .Lenc_loop: |
| ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 |
| ldp q8, q9, [x9], #32 |
| eor v0.16b, v16.16b, v0.16b |
| ldr q10, [x9], #16 |
| eor v1.16b, v17.16b, v1.16b |
| ldr q16, [x9], #16 |
| eor v2.16b, v18.16b, v2.16b |
| eor v3.16b, v19.16b, v3.16b |
| eor v4.16b, v8.16b, v4.16b |
| eor v5.16b, v9.16b, v5.16b |
| eor v6.16b, v10.16b, v6.16b |
| eor v7.16b, v16.16b, v7.16b |
| tbl v0.16b, {v0.16b}, v28.16b |
| tbl v1.16b, {v1.16b}, v28.16b |
| tbl v2.16b, {v2.16b}, v28.16b |
| tbl v3.16b, {v3.16b}, v28.16b |
| tbl v4.16b, {v4.16b}, v28.16b |
| tbl v5.16b, {v5.16b}, v28.16b |
| tbl v6.16b, {v6.16b}, v28.16b |
| tbl v7.16b, {v7.16b}, v28.16b |
| .Lenc_sbox: |
| eor v5.16b, v5.16b, v6.16b |
| eor v3.16b, v3.16b, v0.16b |
| subs x10, x10, #1 |
| eor v2.16b, v2.16b, v1.16b |
| eor v5.16b, v5.16b, v0.16b |
| eor v8.16b, v3.16b, v7.16b |
| eor v6.16b, v6.16b, v2.16b |
| eor v7.16b, v7.16b, v5.16b |
| eor v8.16b, v8.16b, v4.16b |
| eor v3.16b, v6.16b, v3.16b |
| eor v4.16b, v4.16b, v5.16b |
| eor v6.16b, v1.16b, v5.16b |
| eor v2.16b, v2.16b, v7.16b |
| eor v1.16b, v8.16b, v1.16b |
| eor v8.16b, v7.16b, v4.16b |
| eor v9.16b, v3.16b, v0.16b |
| eor v10.16b, v7.16b, v6.16b |
| eor v16.16b, v5.16b, v3.16b |
| eor v17.16b, v6.16b, v2.16b |
| eor v18.16b, v5.16b, v1.16b |
| eor v19.16b, v2.16b, v4.16b |
| eor v20.16b, v1.16b, v0.16b |
| orr v21.16b, v8.16b, v9.16b |
| orr v22.16b, v10.16b, v16.16b |
| eor v23.16b, v8.16b, v17.16b |
| eor v24.16b, v9.16b, v18.16b |
| and v19.16b, v19.16b, v20.16b |
| orr v20.16b, v17.16b, v18.16b |
| and v8.16b, v8.16b, v9.16b |
| and v9.16b, v17.16b, v18.16b |
| and v17.16b, v23.16b, v24.16b |
| and v10.16b, v10.16b, v16.16b |
| eor v16.16b, v21.16b, v19.16b |
| eor v18.16b, v20.16b, v19.16b |
| and v19.16b, v2.16b, v1.16b |
| and v20.16b, v6.16b, v5.16b |
| eor v21.16b, v22.16b, v17.16b |
| eor v9.16b, v9.16b, v10.16b |
| eor v10.16b, v16.16b, v17.16b |
| eor v16.16b, v18.16b, v8.16b |
| and v17.16b, v4.16b, v0.16b |
| orr v18.16b, v7.16b, v3.16b |
| eor v21.16b, v21.16b, v8.16b |
| eor v8.16b, v9.16b, v8.16b |
| eor v9.16b, v10.16b, v19.16b |
| eor v10.16b, v3.16b, v0.16b |
| eor v16.16b, v16.16b, v17.16b |
| eor v17.16b, v5.16b, v1.16b |
| eor v19.16b, v21.16b, v20.16b |
| eor v20.16b, v8.16b, v18.16b |
| eor v8.16b, v8.16b, v18.16b |
| eor v18.16b, v7.16b, v4.16b |
| eor v21.16b, v9.16b, v16.16b |
| eor v22.16b, v6.16b, v2.16b |
| and v23.16b, v9.16b, v19.16b |
| eor v24.16b, v10.16b, v17.16b |
| eor v25.16b, v0.16b, v1.16b |
| eor v26.16b, v7.16b, v6.16b |
| eor v27.16b, v18.16b, v22.16b |
| eor v28.16b, v3.16b, v5.16b |
| eor v29.16b, v16.16b, v23.16b |
| eor v30.16b, v20.16b, v23.16b |
| eor v23.16b, v20.16b, v23.16b |
| eor v31.16b, v4.16b, v2.16b |
| bsl v29.16b, v19.16b, v20.16b |
| bsl v30.16b, v9.16b, v16.16b |
| bsl v8.16b, v29.16b, v23.16b |
| bsl v20.16b, v23.16b, v29.16b |
| eor v9.16b, v30.16b, v29.16b |
| and v5.16b, v5.16b, v30.16b |
| and v8.16b, v8.16b, v30.16b |
| and v1.16b, v1.16b, v29.16b |
| eor v16.16b, v19.16b, v20.16b |
| and v2.16b, v2.16b, v29.16b |
| eor v19.16b, v9.16b, v29.16b |
| and v17.16b, v17.16b, v9.16b |
| eor v8.16b, v8.16b, v21.16b |
| and v20.16b, v22.16b, v9.16b |
| eor v21.16b, v29.16b, v16.16b |
| eor v22.16b, v29.16b, v16.16b |
| and v23.16b, v25.16b, v16.16b |
| and v6.16b, v6.16b, v19.16b |
| eor v25.16b, v8.16b, v16.16b |
| eor v29.16b, v30.16b, v8.16b |
| and v4.16b, v21.16b, v4.16b |
| and v8.16b, v28.16b, v8.16b |
| and v0.16b, v22.16b, v0.16b |
| eor v21.16b, v23.16b, v1.16b |
| eor v22.16b, v9.16b, v25.16b |
| eor v9.16b, v9.16b, v25.16b |
| eor v23.16b, v25.16b, v16.16b |
| and v3.16b, v29.16b, v3.16b |
| and v24.16b, v24.16b, v25.16b |
| and v25.16b, v27.16b, v25.16b |
| and v10.16b, v22.16b, v10.16b |
| and v9.16b, v9.16b, v18.16b |
| eor v18.16b, v19.16b, v23.16b |
| and v19.16b, v26.16b, v23.16b |
| eor v3.16b, v5.16b, v3.16b |
| eor v17.16b, v17.16b, v24.16b |
| eor v10.16b, v24.16b, v10.16b |
| and v16.16b, v31.16b, v16.16b |
| eor v20.16b, v20.16b, v25.16b |
| eor v9.16b, v25.16b, v9.16b |
| eor v4.16b, v2.16b, v4.16b |
| and v7.16b, v18.16b, v7.16b |
| eor v18.16b, v19.16b, v6.16b |
| eor v5.16b, v8.16b, v5.16b |
| eor v0.16b, v1.16b, v0.16b |
| eor v1.16b, v21.16b, v10.16b |
| eor v8.16b, v3.16b, v17.16b |
| eor v2.16b, v16.16b, v2.16b |
| eor v3.16b, v6.16b, v7.16b |
| eor v6.16b, v18.16b, v9.16b |
| eor v4.16b, v4.16b, v20.16b |
| eor v10.16b, v5.16b, v10.16b |
| eor v0.16b, v0.16b, v17.16b |
| eor v9.16b, v2.16b, v9.16b |
| eor v3.16b, v3.16b, v20.16b |
| eor v7.16b, v6.16b, v1.16b |
| eor v5.16b, v8.16b, v4.16b |
| eor v6.16b, v10.16b, v1.16b |
| eor v2.16b, v4.16b, v0.16b |
| eor v4.16b, v3.16b, v10.16b |
| eor v9.16b, v9.16b, v7.16b |
| eor v3.16b, v0.16b, v5.16b |
| eor v0.16b, v1.16b, v4.16b |
| eor v1.16b, v4.16b, v8.16b |
| eor v4.16b, v9.16b, v5.16b |
| eor v6.16b, v6.16b, v3.16b |
| bcc .Lenc_done |
| ext v8.16b, v0.16b, v0.16b, #12 |
| ext v9.16b, v4.16b, v4.16b, #12 |
ldr q28, [x11] // load from .LSR in common case (x10 > 0)
| ext v10.16b, v6.16b, v6.16b, #12 |
| ext v16.16b, v1.16b, v1.16b, #12 |
| ext v17.16b, v3.16b, v3.16b, #12 |
| ext v18.16b, v7.16b, v7.16b, #12 |
| eor v0.16b, v0.16b, v8.16b |
| eor v4.16b, v4.16b, v9.16b |
| eor v6.16b, v6.16b, v10.16b |
| ext v19.16b, v2.16b, v2.16b, #12 |
| ext v20.16b, v5.16b, v5.16b, #12 |
| eor v1.16b, v1.16b, v16.16b |
| eor v3.16b, v3.16b, v17.16b |
| eor v7.16b, v7.16b, v18.16b |
| eor v2.16b, v2.16b, v19.16b |
| eor v16.16b, v16.16b, v0.16b |
| eor v5.16b, v5.16b, v20.16b |
| eor v17.16b, v17.16b, v6.16b |
| eor v10.16b, v10.16b, v4.16b |
| ext v0.16b, v0.16b, v0.16b, #8 |
| eor v9.16b, v9.16b, v1.16b |
| ext v1.16b, v1.16b, v1.16b, #8 |
| eor v8.16b, v8.16b, v5.16b |
| eor v16.16b, v16.16b, v5.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v7.16b |
| ext v3.16b, v3.16b, v3.16b, #8 |
| ext v7.16b, v7.16b, v7.16b, #8 |
| eor v20.16b, v20.16b, v2.16b |
| ext v6.16b, v6.16b, v6.16b, #8 |
| ext v21.16b, v5.16b, v5.16b, #8 |
| eor v17.16b, v17.16b, v5.16b |
| ext v2.16b, v2.16b, v2.16b, #8 |
| eor v10.16b, v10.16b, v5.16b |
| ext v22.16b, v4.16b, v4.16b, #8 |
| eor v0.16b, v0.16b, v8.16b |
| eor v1.16b, v1.16b, v16.16b |
| eor v5.16b, v7.16b, v18.16b |
| eor v4.16b, v3.16b, v17.16b |
| eor v3.16b, v6.16b, v10.16b |
| eor v7.16b, v21.16b, v20.16b |
| eor v6.16b, v2.16b, v19.16b |
| eor v2.16b, v22.16b, v9.16b |
| bne .Lenc_loop |
| ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0) |
| b .Lenc_loop |
| .align 4 |
| .Lenc_done: |
| ushr v8.2d, v0.2d, #1 |
| movi v9.16b, #0x55 |
| ldr q10, [x9] |
| ushr v16.2d, v3.2d, #1 |
| movi v17.16b, #0x33 |
| ushr v18.2d, v4.2d, #1 |
| movi v19.16b, #0x0f |
| eor v8.16b, v8.16b, v1.16b |
| ushr v20.2d, v2.2d, #1 |
| eor v16.16b, v16.16b, v7.16b |
| eor v18.16b, v18.16b, v6.16b |
| and v8.16b, v8.16b, v9.16b |
| eor v20.16b, v20.16b, v5.16b |
| and v16.16b, v16.16b, v9.16b |
| and v18.16b, v18.16b, v9.16b |
| shl v21.2d, v8.2d, #1 |
| eor v1.16b, v1.16b, v8.16b |
| and v8.16b, v20.16b, v9.16b |
| eor v7.16b, v7.16b, v16.16b |
| shl v9.2d, v16.2d, #1 |
| eor v6.16b, v6.16b, v18.16b |
| shl v16.2d, v18.2d, #1 |
| eor v0.16b, v0.16b, v21.16b |
| shl v18.2d, v8.2d, #1 |
| eor v5.16b, v5.16b, v8.16b |
| eor v3.16b, v3.16b, v9.16b |
| eor v4.16b, v4.16b, v16.16b |
| ushr v8.2d, v1.2d, #2 |
| eor v2.16b, v2.16b, v18.16b |
| ushr v9.2d, v0.2d, #2 |
| ushr v16.2d, v7.2d, #2 |
| ushr v18.2d, v3.2d, #2 |
| eor v8.16b, v8.16b, v6.16b |
| eor v9.16b, v9.16b, v4.16b |
| eor v16.16b, v16.16b, v5.16b |
| eor v18.16b, v18.16b, v2.16b |
| and v8.16b, v8.16b, v17.16b |
| and v9.16b, v9.16b, v17.16b |
| and v16.16b, v16.16b, v17.16b |
| and v17.16b, v18.16b, v17.16b |
| eor v6.16b, v6.16b, v8.16b |
| shl v8.2d, v8.2d, #2 |
| eor v4.16b, v4.16b, v9.16b |
| shl v9.2d, v9.2d, #2 |
| eor v5.16b, v5.16b, v16.16b |
| shl v16.2d, v16.2d, #2 |
| eor v2.16b, v2.16b, v17.16b |
| shl v17.2d, v17.2d, #2 |
| eor v1.16b, v1.16b, v8.16b |
| eor v0.16b, v0.16b, v9.16b |
| eor v7.16b, v7.16b, v16.16b |
| eor v3.16b, v3.16b, v17.16b |
| ushr v8.2d, v6.2d, #4 |
| ushr v9.2d, v4.2d, #4 |
| ushr v16.2d, v1.2d, #4 |
| ushr v17.2d, v0.2d, #4 |
| eor v8.16b, v8.16b, v5.16b |
| eor v9.16b, v9.16b, v2.16b |
| eor v16.16b, v16.16b, v7.16b |
| eor v17.16b, v17.16b, v3.16b |
| and v8.16b, v8.16b, v19.16b |
| and v9.16b, v9.16b, v19.16b |
| and v16.16b, v16.16b, v19.16b |
| and v17.16b, v17.16b, v19.16b |
| eor v5.16b, v5.16b, v8.16b |
| shl v8.2d, v8.2d, #4 |
| eor v2.16b, v2.16b, v9.16b |
| shl v9.2d, v9.2d, #4 |
| eor v7.16b, v7.16b, v16.16b |
| shl v16.2d, v16.2d, #4 |
| eor v3.16b, v3.16b, v17.16b |
| shl v17.2d, v17.2d, #4 |
| eor v6.16b, v6.16b, v8.16b |
| eor v4.16b, v4.16b, v9.16b |
| eor v7.16b, v7.16b, v10.16b |
| eor v1.16b, v1.16b, v16.16b |
| eor v3.16b, v3.16b, v10.16b |
| eor v0.16b, v0.16b, v17.16b |
| eor v6.16b, v6.16b, v10.16b |
| eor v4.16b, v4.16b, v10.16b |
| eor v2.16b, v2.16b, v10.16b |
| eor v5.16b, v5.16b, v10.16b |
| eor v1.16b, v1.16b, v10.16b |
| eor v0.16b, v0.16b, v10.16b |
| ret |
| .size _bsaes_encrypt8,.-_bsaes_encrypt8 |
| |
| .type _bsaes_key_convert,%function |
| .align 4 |
| // On entry: |
| // x9 -> input key (big-endian) |
| // x10 = number of rounds |
| // x17 -> output key (native endianness) |
| // On exit: |
| // x9, x10 corrupted |
| // x11 -> .LM0_bigendian |
| // x17 -> last quadword of output key |
| // other general-purpose registers preserved |
| // v2-v6 preserved |
| // v7.16b[] = 0x63 |
| // v8-v14 preserved |
| // v15 = last round key (converted to native endianness) |
| // other SIMD registers corrupted |
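// The key loop below bit-slices each round key: the key bytes are first
// permuted through the .LM0 mask and XORed with 0x63 (absorbing the
// S-box affine constant), then each of the eight bit positions is
// expanded to a full vector with cmtst. A rough C equivalent of one
// round (an illustrative sketch only; `perm` stands for the .LM0 mask):
//
//   for (int b = 0; b < 8; b++)            // one output vector per bit
//       for (int i = 0; i < 16; i++) {
//           unsigned char t = key[perm[i]] ^ 0x63;
//           out[b][i] = (t & (1 << b)) ? 0xff : 0x00;
//       }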
| _bsaes_key_convert: |
| #ifdef __AARCH64EL__ |
| adr x11, .LM0_littleendian |
| #else |
| adr x11, .LM0_bigendian |
| #endif |
| ldr q0, [x9], #16 // load round 0 key |
| ldr q1, [x11] // .LM0 |
| ldr q15, [x9], #16 // load round 1 key |
| |
| movi v7.16b, #0x63 // compose .L63 |
| movi v16.16b, #0x01 // bit masks |
| movi v17.16b, #0x02 |
| movi v18.16b, #0x04 |
| movi v19.16b, #0x08 |
| movi v20.16b, #0x10 |
| movi v21.16b, #0x20 |
| movi v22.16b, #0x40 |
| movi v23.16b, #0x80 |
| |
| #ifdef __AARCH64EL__ |
| rev32 v0.16b, v0.16b |
| #endif |
| sub x10, x10, #1 |
| str q0, [x17], #16 // save round 0 key |
| |
| .align 4 |
| .Lkey_loop: |
| tbl v0.16b, {v15.16b}, v1.16b |
| ldr q15, [x9], #16 // load next round key |
| |
| eor v0.16b, v0.16b, v7.16b |
| cmtst v24.16b, v0.16b, v16.16b |
| cmtst v25.16b, v0.16b, v17.16b |
| cmtst v26.16b, v0.16b, v18.16b |
| cmtst v27.16b, v0.16b, v19.16b |
| cmtst v28.16b, v0.16b, v20.16b |
| cmtst v29.16b, v0.16b, v21.16b |
| cmtst v30.16b, v0.16b, v22.16b |
| cmtst v31.16b, v0.16b, v23.16b |
| sub x10, x10, #1 |
| st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key |
| st1 {v28.16b-v31.16b}, [x17], #64 |
| cbnz x10, .Lkey_loop |
| |
| // don't save last round key |
| #ifdef __AARCH64EL__ |
| rev32 v15.16b, v15.16b |
| adr x11, .LM0_bigendian |
| #endif |
| ret |
| .size _bsaes_key_convert,.-_bsaes_key_convert |
| |
| .globl ossl_bsaes_cbc_encrypt |
| .type ossl_bsaes_cbc_encrypt,%function |
| .align 4 |
| // On entry: |
| // x0 -> input ciphertext |
| // x1 -> output plaintext |
// x2 = size of ciphertext and plaintext in bytes (assumed to be a multiple of 16)
| // x3 -> key |
| // x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) |
// w5 = enc flag, which must be == 0 (this entry point only decrypts)
| // On exit: |
| // Output plaintext filled in |
| // Initialisation vector overwritten with last quadword of ciphertext |
| // No output registers, usual AAPCS64 register preservation |
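// Seen from C, this entry point corresponds to a prototype along the
// lines of (parameter names are illustrative):
//
//   void ossl_bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                               size_t length, const AES_KEY *key,
//                               unsigned char ivec[16], int enc);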
| ossl_bsaes_cbc_encrypt: |
| cmp x2, #128 |
| #ifdef __APPLE__ |
| bhs .Lcbc_do_bsaes |
| b AES_cbc_encrypt |
| .Lcbc_do_bsaes: |
| #else |
| blo AES_cbc_encrypt |
| #endif |
| |
| // it is up to the caller to make sure we are called with enc == 0 |
| |
| stp fp, lr, [sp, #-48]! |
| stp d8, d9, [sp, #16] |
| stp d10, d15, [sp, #32] |
| lsr x2, x2, #4 // len in 16 byte blocks |
| |
| ldr w15, [x3, #240] // get # of rounds |
| mov x14, sp |
| |
| // allocate the key schedule on the stack |
| add x17, sp, #96 |
| sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes |
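// (e.g. for AES-128, x15 == 10 rounds, so 10*128 - 96 = 1184 bytes)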
| |
| // populate the key schedule |
| mov x9, x3 // pass key |
| mov x10, x15 // pass # of rounds |
mov sp, x17 // move the stack pointer down to cover the key schedule
| bl _bsaes_key_convert |
| ldr q6, [sp] |
| str q15, [x17] // save last round key |
| eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) |
| str q6, [sp] |
| |
| ldr q15, [x4] // load IV |
| b .Lcbc_dec_loop |
| |
| .align 4 |
| .Lcbc_dec_loop: |
| subs x2, x2, #0x8 |
| bmi .Lcbc_dec_loop_finish |
| |
| ldr q0, [x0], #16 // load input |
| mov x9, sp // pass the key |
| ldr q1, [x0], #16 |
| mov x10, x15 |
| ldr q2, [x0], #16 |
| ldr q3, [x0], #16 |
| ldr q4, [x0], #16 |
| ldr q5, [x0], #16 |
| ldr q6, [x0], #16 |
| ldr q7, [x0], #-7*16 |
| |
| bl _bsaes_decrypt8 |
| |
| ldr q16, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| eor v1.16b, v1.16b, v16.16b |
| str q0, [x1], #16 // write output |
| ldr q0, [x0], #16 |
| str q1, [x1], #16 |
| ldr q1, [x0], #16 |
| eor v1.16b, v4.16b, v1.16b |
| ldr q4, [x0], #16 |
| eor v2.16b, v2.16b, v4.16b |
| eor v0.16b, v6.16b, v0.16b |
| ldr q4, [x0], #16 |
| str q0, [x1], #16 |
| str q1, [x1], #16 |
| eor v0.16b, v7.16b, v4.16b |
| ldr q1, [x0], #16 |
| str q2, [x1], #16 |
| ldr q2, [x0], #16 |
| ldr q15, [x0], #16 |
| str q0, [x1], #16 |
| eor v0.16b, v5.16b, v2.16b |
| eor v1.16b, v3.16b, v1.16b |
| str q1, [x1], #16 |
| str q0, [x1], #16 |
| |
| b .Lcbc_dec_loop |
| |
| .Lcbc_dec_loop_finish: |
| adds x2, x2, #8 |
| beq .Lcbc_dec_done |
| |
| ldr q0, [x0], #16 // load input |
| cmp x2, #2 |
| blo .Lcbc_dec_one |
| ldr q1, [x0], #16 |
| mov x9, sp // pass the key |
| mov x10, x15 |
| beq .Lcbc_dec_two |
| ldr q2, [x0], #16 |
| cmp x2, #4 |
| blo .Lcbc_dec_three |
| ldr q3, [x0], #16 |
| beq .Lcbc_dec_four |
| ldr q4, [x0], #16 |
| cmp x2, #6 |
| blo .Lcbc_dec_five |
| ldr q5, [x0], #16 |
| beq .Lcbc_dec_six |
| ldr q6, [x0], #-6*16 |
| |
| bl _bsaes_decrypt8 |
| |
| ldr q5, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q8, [x0], #16 |
| ldr q9, [x0], #16 |
| ldr q10, [x0], #16 |
| str q0, [x1], #16 // write output |
| ldr q0, [x0], #16 |
| eor v1.16b, v1.16b, v5.16b |
| ldr q5, [x0], #16 |
| eor v6.16b, v6.16b, v8.16b |
| ldr q15, [x0] |
| eor v4.16b, v4.16b, v9.16b |
| eor v2.16b, v2.16b, v10.16b |
| str q1, [x1], #16 |
| eor v0.16b, v7.16b, v0.16b |
| str q6, [x1], #16 |
| eor v1.16b, v3.16b, v5.16b |
| str q4, [x1], #16 |
| str q2, [x1], #16 |
| str q0, [x1], #16 |
| str q1, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_six: |
| sub x0, x0, #0x60 |
| bl _bsaes_decrypt8 |
| ldr q3, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q5, [x0], #16 |
| ldr q8, [x0], #16 |
| ldr q9, [x0], #16 |
| str q0, [x1], #16 // write output |
| ldr q0, [x0], #16 |
| eor v1.16b, v1.16b, v3.16b |
| ldr q15, [x0] |
| eor v3.16b, v6.16b, v5.16b |
| eor v4.16b, v4.16b, v8.16b |
| eor v2.16b, v2.16b, v9.16b |
| str q1, [x1], #16 |
| eor v0.16b, v7.16b, v0.16b |
| str q3, [x1], #16 |
| str q4, [x1], #16 |
| str q2, [x1], #16 |
| str q0, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_five: |
| sub x0, x0, #0x50 |
| bl _bsaes_decrypt8 |
| ldr q3, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q5, [x0], #16 |
| ldr q7, [x0], #16 |
| ldr q8, [x0], #16 |
| str q0, [x1], #16 // write output |
| ldr q15, [x0] |
| eor v0.16b, v1.16b, v3.16b |
| eor v1.16b, v6.16b, v5.16b |
| eor v3.16b, v4.16b, v7.16b |
| str q0, [x1], #16 |
| eor v0.16b, v2.16b, v8.16b |
| str q1, [x1], #16 |
| str q3, [x1], #16 |
| str q0, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_four: |
| sub x0, x0, #0x40 |
| bl _bsaes_decrypt8 |
| ldr q2, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q3, [x0], #16 |
| ldr q5, [x0], #16 |
| str q0, [x1], #16 // write output |
| ldr q15, [x0] |
| eor v0.16b, v1.16b, v2.16b |
| eor v1.16b, v6.16b, v3.16b |
| eor v2.16b, v4.16b, v5.16b |
| str q0, [x1], #16 |
| str q1, [x1], #16 |
| str q2, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_three: |
| sub x0, x0, #0x30 |
| bl _bsaes_decrypt8 |
| ldr q2, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q3, [x0], #16 |
| ldr q15, [x0] |
| str q0, [x1], #16 // write output |
| eor v0.16b, v1.16b, v2.16b |
| eor v1.16b, v6.16b, v3.16b |
| str q0, [x1], #16 |
| str q1, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_two: |
| sub x0, x0, #0x20 |
| bl _bsaes_decrypt8 |
| ldr q2, [x0], #16 // reload input |
| eor v0.16b, v0.16b, v15.16b // ^= IV |
| ldr q15, [x0] |
| str q0, [x1], #16 // write output |
| eor v0.16b, v1.16b, v2.16b |
| str q0, [x1] |
| b .Lcbc_dec_done |
| .align 4 |
| .Lcbc_dec_one: |
| sub x0, x0, #0x10 |
| stp x1, x4, [sp, #-32]! |
| str x14, [sp, #16] |
| mov v8.16b, v15.16b |
| mov v15.16b, v0.16b |
| mov x2, x3 |
| bl AES_decrypt |
| ldr x14, [sp, #16] |
| ldp x1, x4, [sp], #32 |
| ldr q0, [x1] // load result |
| eor v0.16b, v0.16b, v8.16b // ^= IV |
| str q0, [x1] // write output |
| |
| .align 4 |
| .Lcbc_dec_done: |
| movi v0.16b, #0 |
| movi v1.16b, #0 |
.Lcbc_dec_bzero: // wipe key schedule [if any]
| stp q0, q1, [sp], #32 |
| cmp sp, x14 |
| bne .Lcbc_dec_bzero |
| str q15, [x4] // return IV |
| ldp d8, d9, [sp, #16] |
| ldp d10, d15, [sp, #32] |
| ldp fp, lr, [sp], #48 |
| ret |
| .size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt |
| |
| .globl ossl_bsaes_ctr32_encrypt_blocks |
| .type ossl_bsaes_ctr32_encrypt_blocks,%function |
| .align 4 |
| // On entry: |
| // x0 -> input text (whole 16-byte blocks) |
| // x1 -> output text (whole 16-byte blocks) |
| // x2 = number of 16-byte blocks to encrypt/decrypt (> 0) |
| // x3 -> key |
| // x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block |
| // On exit: |
| // Output text filled in |
| // No output registers, usual AAPCS64 register preservation |
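// Seen from C, this entry point corresponds to a prototype along the
// lines of (parameter names are illustrative):
//
//   void ossl_bsaes_ctr32_encrypt_blocks(const unsigned char *in,
//                                        unsigned char *out, size_t len,
//                                        const AES_KEY *key,
//                                        const unsigned char ivec[16]);
//
// Per block, only the low (big-endian) 32 bits of the counter change,
// i.e. roughly: ctr32 = bswap32(bswap32(ctr32) + 1).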
| ossl_bsaes_ctr32_encrypt_blocks: |
| |
| cmp x2, #8 // use plain AES for |
| blo .Lctr_enc_short // small sizes |
| |
| stp fp, lr, [sp, #-80]! |
| stp d8, d9, [sp, #16] |
| stp d10, d11, [sp, #32] |
| stp d12, d13, [sp, #48] |
| stp d14, d15, [sp, #64] |
| |
| ldr w15, [x3, #240] // get # of rounds |
| mov x14, sp |
| |
| // allocate the key schedule on the stack |
| add x17, sp, #96 |
| sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes |
| |
| // populate the key schedule |
| mov x9, x3 // pass key |
| mov x10, x15 // pass # of rounds |
mov sp, x17 // move the stack pointer down to cover the key schedule
| bl _bsaes_key_convert |
| eor v7.16b, v7.16b, v15.16b // fix up last round key |
| str q7, [x17] // save last round key |
| |
| ldr q0, [x4] // load counter |
| add x13, x11, #.LREVM0SR-.LM0_bigendian |
| ldr q4, [sp] // load round0 key |
| |
| movi v8.4s, #1 // compose 1<<96 |
| movi v9.16b, #0 |
| rev32 v15.16b, v0.16b |
| rev32 v0.16b, v0.16b |
| ext v11.16b, v9.16b, v8.16b, #4 |
| rev32 v4.16b, v4.16b |
| add v12.4s, v11.4s, v11.4s // compose 2<<96 |
| str q4, [sp] // save adjusted round0 key |
| add v13.4s, v11.4s, v12.4s // compose 3<<96 |
| add v14.4s, v12.4s, v12.4s // compose 4<<96 |
| b .Lctr_enc_loop |
| |
| .align 4 |
| .Lctr_enc_loop: |
// Intermix the prologue from _bsaes_encrypt8 here, taking the
// opportunity to flip the byte order of the 32-bit counter
| |
| add v1.4s, v15.4s, v11.4s // +1 |
| add x9, sp, #0x10 // pass next round key |
| add v2.4s, v15.4s, v12.4s // +2 |
| ldr q9, [x13] // .LREVM0SR |
| ldr q8, [sp] // load round0 key |
| add v3.4s, v15.4s, v13.4s // +3 |
| mov x10, x15 // pass rounds |
| sub x11, x13, #.LREVM0SR-.LSR // pass constants |
| add v6.4s, v2.4s, v14.4s |
| add v4.4s, v15.4s, v14.4s // +4 |
| add v7.4s, v3.4s, v14.4s |
| add v15.4s, v4.4s, v14.4s // next counter |
| add v5.4s, v1.4s, v14.4s |
| |
| bl _bsaes_encrypt8_alt |
| |
| subs x2, x2, #8 |
| blo .Lctr_enc_loop_done |
| |
| ldr q16, [x0], #16 |
| ldr q17, [x0], #16 |
| eor v1.16b, v1.16b, v17.16b |
| ldr q17, [x0], #16 |
| eor v0.16b, v0.16b, v16.16b |
| eor v4.16b, v4.16b, v17.16b |
| str q0, [x1], #16 |
| ldr q16, [x0], #16 |
| str q1, [x1], #16 |
| mov v0.16b, v15.16b |
| str q4, [x1], #16 |
| ldr q1, [x0], #16 |
| eor v4.16b, v6.16b, v16.16b |
| eor v1.16b, v3.16b, v1.16b |
| ldr q3, [x0], #16 |
| eor v3.16b, v7.16b, v3.16b |
| ldr q6, [x0], #16 |
| eor v2.16b, v2.16b, v6.16b |
| ldr q6, [x0], #16 |
| eor v5.16b, v5.16b, v6.16b |
| str q4, [x1], #16 |
| str q1, [x1], #16 |
| str q3, [x1], #16 |
| str q2, [x1], #16 |
| str q5, [x1], #16 |
| |
| bne .Lctr_enc_loop |
| b .Lctr_enc_done |
| |
| .align 4 |
| .Lctr_enc_loop_done: |
| add x2, x2, #8 |
| ldr q16, [x0], #16 // load input |
| eor v0.16b, v0.16b, v16.16b |
| str q0, [x1], #16 // write output |
| cmp x2, #2 |
| blo .Lctr_enc_done |
| ldr q17, [x0], #16 |
| eor v1.16b, v1.16b, v17.16b |
| str q1, [x1], #16 |
| beq .Lctr_enc_done |
| ldr q18, [x0], #16 |
| eor v4.16b, v4.16b, v18.16b |
| str q4, [x1], #16 |
| cmp x2, #4 |
| blo .Lctr_enc_done |
| ldr q19, [x0], #16 |
| eor v6.16b, v6.16b, v19.16b |
| str q6, [x1], #16 |
| beq .Lctr_enc_done |
| ldr q20, [x0], #16 |
| eor v3.16b, v3.16b, v20.16b |
| str q3, [x1], #16 |
| cmp x2, #6 |
| blo .Lctr_enc_done |
| ldr q21, [x0], #16 |
| eor v7.16b, v7.16b, v21.16b |
| str q7, [x1], #16 |
| beq .Lctr_enc_done |
| ldr q22, [x0] |
| eor v2.16b, v2.16b, v22.16b |
| str q2, [x1], #16 |
| |
| .Lctr_enc_done: |
| movi v0.16b, #0 |
| movi v1.16b, #0 |
| .Lctr_enc_bzero: // wipe key schedule [if any] |
| stp q0, q1, [sp], #32 |
| cmp sp, x14 |
| bne .Lctr_enc_bzero |
| |
| ldp d8, d9, [sp, #16] |
| ldp d10, d11, [sp, #32] |
| ldp d12, d13, [sp, #48] |
| ldp d14, d15, [sp, #64] |
| ldp fp, lr, [sp], #80 |
| ret |
| |
| .Lctr_enc_short: |
| stp fp, lr, [sp, #-96]! |
| stp x19, x20, [sp, #16] |
| stp x21, x22, [sp, #32] |
| str x23, [sp, #48] |
| |
| mov x19, x0 // copy arguments |
| mov x20, x1 |
| mov x21, x2 |
| mov x22, x3 |
ldr w23, [x4, #12] // load counter LSW
| ldr q1, [x4] // load whole counter value |
| #ifdef __AARCH64EL__ |
| rev w23, w23 |
| #endif |
| str q1, [sp, #80] // copy counter value |
| |
| .Lctr_enc_short_loop: |
| add x0, sp, #80 // input counter value |
| add x1, sp, #64 // output on the stack |
| mov x2, x22 // key |
| |
| bl AES_encrypt |
| |
| ldr q0, [x19], #16 // load input |
| ldr q1, [sp, #64] // load encrypted counter |
| add x23, x23, #1 |
| #ifdef __AARCH64EL__ |
| rev w0, w23 |
| str w0, [sp, #80+12] // next counter value |
| #else |
| str w23, [sp, #80+12] // next counter value |
| #endif |
| eor v0.16b, v0.16b, v1.16b |
| str q0, [x20], #16 // store output |
| subs x21, x21, #1 |
| bne .Lctr_enc_short_loop |
| |
| movi v0.16b, #0 |
| movi v1.16b, #0 |
| stp q0, q1, [sp, #64] |
| |
| ldr x23, [sp, #48] |
| ldp x21, x22, [sp, #32] |
| ldp x19, x20, [sp, #16] |
| ldp fp, lr, [sp], #96 |
| ret |
| .size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks |
| |
| .globl ossl_bsaes_xts_encrypt |
| .type ossl_bsaes_xts_encrypt,%function |
| .align 4 |
| // On entry: |
| // x0 -> input plaintext |
| // x1 -> output ciphertext |
// x2 = length of text in bytes (must be at least 16)
| // x3 -> key1 (used to encrypt the XORed plaintext blocks) |
| // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) |
| // x5 -> 16-byte initial vector (typically, sector number) |
| // On exit: |
| // Output ciphertext filled in |
| // No output registers, usual AAPCS64 register preservation |
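// Seen from C, this entry point corresponds to a prototype along the
// lines of (parameter names are illustrative):
//
//   void ossl_bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
//                               size_t len, const AES_KEY *key1,
//                               const AES_KEY *key2, const unsigned char iv[16]);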
| ossl_bsaes_xts_encrypt: |
| // Stack layout: |
| // sp -> |
| // nrounds*128-96 bytes: key schedule |
| // x19 -> |
| // 16 bytes: frame record |
| // 4*16 bytes: tweak storage across _bsaes_encrypt8 |
// 6*8 bytes: storage for 5 callee-saved general-purpose registers (one slot is alignment padding)
| // 8*8 bytes: storage for 8 callee-saved SIMD registers |
| stp fp, lr, [sp, #-192]! |
| stp x19, x20, [sp, #80] |
| stp x21, x22, [sp, #96] |
| str x23, [sp, #112] |
| stp d8, d9, [sp, #128] |
| stp d10, d11, [sp, #144] |
| stp d12, d13, [sp, #160] |
| stp d14, d15, [sp, #176] |
| |
| mov x19, sp |
| mov x20, x0 |
| mov x21, x1 |
| mov x22, x2 |
| mov x23, x3 |
| |
| // generate initial tweak |
| sub sp, sp, #16 |
| mov x0, x5 // iv[] |
| mov x1, sp |
| mov x2, x4 // key2 |
| bl AES_encrypt |
| ldr q11, [sp], #16 |
| |
| ldr w1, [x23, #240] // get # of rounds |
| // allocate the key schedule on the stack |
| add x17, sp, #96 |
| sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes |
| |
| // populate the key schedule |
| mov x9, x23 // pass key |
| mov x10, x1 // pass # of rounds |
| mov sp, x17 |
| bl _bsaes_key_convert |
| eor v15.16b, v15.16b, v7.16b // fix up last round key |
| str q15, [x17] // save last round key |
| |
| subs x22, x22, #0x80 |
| blo .Lxts_enc_short |
| b .Lxts_enc_loop |
| |
| .align 4 |
| .Lxts_enc_loop: |
| ldr q8, .Lxts_magic |
| mov x10, x1 // pass rounds |
| add x2, x19, #16 |
| ldr q0, [x20], #16 |
| sshr v1.2d, v11.2d, #63 |
| mov x9, sp // pass key schedule |
| ldr q6, .Lxts_magic+16 |
| add v2.2d, v11.2d, v11.2d |
| cmtst v3.2d, v11.2d, v6.2d |
| and v1.16b, v1.16b, v8.16b |
| ext v1.16b, v1.16b, v1.16b, #8 |
| and v3.16b, v3.16b, v8.16b |
| ldr q4, [x20], #16 |
| eor v12.16b, v2.16b, v1.16b |
| eor v1.16b, v4.16b, v12.16b |
| eor v0.16b, v0.16b, v11.16b |
| cmtst v2.2d, v12.2d, v6.2d |
| add v4.2d, v12.2d, v12.2d |
| add x0, x19, #16 |
| ext v3.16b, v3.16b, v3.16b, #8 |
| and v2.16b, v2.16b, v8.16b |
| eor v13.16b, v4.16b, v3.16b |
| ldr q3, [x20], #16 |
| ext v4.16b, v2.16b, v2.16b, #8 |
| eor v2.16b, v3.16b, v13.16b |
| ldr q3, [x20], #16 |
| add v5.2d, v13.2d, v13.2d |
| cmtst v7.2d, v13.2d, v6.2d |
| and v7.16b, v7.16b, v8.16b |
| ldr q9, [x20], #16 |
| ext v7.16b, v7.16b, v7.16b, #8 |
| ldr q10, [x20], #16 |
| eor v14.16b, v5.16b, v4.16b |
| ldr q16, [x20], #16 |
| add v4.2d, v14.2d, v14.2d |
| eor v3.16b, v3.16b, v14.16b |
| eor v15.16b, v4.16b, v7.16b |
| add v5.2d, v15.2d, v15.2d |
| ldr q7, [x20], #16 |
| cmtst v4.2d, v14.2d, v6.2d |
| and v17.16b, v4.16b, v8.16b |
| cmtst v18.2d, v15.2d, v6.2d |
| eor v4.16b, v9.16b, v15.16b |
| ext v9.16b, v17.16b, v17.16b, #8 |
| eor v9.16b, v5.16b, v9.16b |
| add v17.2d, v9.2d, v9.2d |
| and v18.16b, v18.16b, v8.16b |
| eor v5.16b, v10.16b, v9.16b |
| str q9, [x2], #16 |
| ext v10.16b, v18.16b, v18.16b, #8 |
| cmtst v9.2d, v9.2d, v6.2d |
| and v9.16b, v9.16b, v8.16b |
| eor v10.16b, v17.16b, v10.16b |
| cmtst v17.2d, v10.2d, v6.2d |
| eor v6.16b, v16.16b, v10.16b |
| str q10, [x2], #16 |
| ext v9.16b, v9.16b, v9.16b, #8 |
| add v10.2d, v10.2d, v10.2d |
| eor v9.16b, v10.16b, v9.16b |
| str q9, [x2], #16 |
| eor v7.16b, v7.16b, v9.16b |
| add v9.2d, v9.2d, v9.2d |
| and v8.16b, v17.16b, v8.16b |
| ext v8.16b, v8.16b, v8.16b, #8 |
| eor v8.16b, v9.16b, v8.16b |
| str q8, [x2] // next round tweak |
| |
| bl _bsaes_encrypt8 |
| |
| ldr q8, [x0], #16 |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| ldr q9, [x0], #16 |
| eor v4.16b, v4.16b, v13.16b |
| eor v6.16b, v6.16b, v14.16b |
| ldr q10, [x0], #16 |
| eor v3.16b, v3.16b, v15.16b |
| subs x22, x22, #0x80 |
| str q0, [x21], #16 |
| ldr q11, [x0] // next round tweak |
| str q1, [x21], #16 |
| eor v0.16b, v7.16b, v8.16b |
| eor v1.16b, v2.16b, v9.16b |
| str q4, [x21], #16 |
| eor v2.16b, v5.16b, v10.16b |
| str q6, [x21], #16 |
| str q3, [x21], #16 |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q2, [x21], #16 |
| bpl .Lxts_enc_loop |
| |
| .Lxts_enc_short: |
| adds x22, x22, #0x70 |
| bmi .Lxts_enc_done |
| |
| ldr q8, .Lxts_magic |
| sshr v1.2d, v11.2d, #63 |
| add v2.2d, v11.2d, v11.2d |
| ldr q9, .Lxts_magic+16 |
| subs x22, x22, #0x10 |
| ldr q0, [x20], #16 |
| and v1.16b, v1.16b, v8.16b |
| cmtst v3.2d, v11.2d, v9.2d |
| ext v1.16b, v1.16b, v1.16b, #8 |
| and v3.16b, v3.16b, v8.16b |
| eor v12.16b, v2.16b, v1.16b |
| ext v1.16b, v3.16b, v3.16b, #8 |
| add v2.2d, v12.2d, v12.2d |
| cmtst v3.2d, v12.2d, v9.2d |
| eor v13.16b, v2.16b, v1.16b |
| and v22.16b, v3.16b, v8.16b |
| bmi .Lxts_enc_1 |
| |
| ext v2.16b, v22.16b, v22.16b, #8 |
| add v3.2d, v13.2d, v13.2d |
| ldr q1, [x20], #16 |
| cmtst v4.2d, v13.2d, v9.2d |
| subs x22, x22, #0x10 |
| eor v14.16b, v3.16b, v2.16b |
| and v23.16b, v4.16b, v8.16b |
| bmi .Lxts_enc_2 |
| |
| ext v3.16b, v23.16b, v23.16b, #8 |
| add v4.2d, v14.2d, v14.2d |
| ldr q2, [x20], #16 |
| cmtst v5.2d, v14.2d, v9.2d |
| eor v0.16b, v0.16b, v11.16b |
| subs x22, x22, #0x10 |
| eor v15.16b, v4.16b, v3.16b |
| and v24.16b, v5.16b, v8.16b |
| bmi .Lxts_enc_3 |
| |
| ext v4.16b, v24.16b, v24.16b, #8 |
| add v5.2d, v15.2d, v15.2d |
| ldr q3, [x20], #16 |
| cmtst v6.2d, v15.2d, v9.2d |
| eor v1.16b, v1.16b, v12.16b |
| subs x22, x22, #0x10 |
| eor v16.16b, v5.16b, v4.16b |
| and v25.16b, v6.16b, v8.16b |
| bmi .Lxts_enc_4 |
| |
| ext v5.16b, v25.16b, v25.16b, #8 |
| add v6.2d, v16.2d, v16.2d |
| add x0, x19, #16 |
| cmtst v7.2d, v16.2d, v9.2d |
| ldr q4, [x20], #16 |
| eor v2.16b, v2.16b, v13.16b |
| str q16, [x0], #16 |
| subs x22, x22, #0x10 |
| eor v17.16b, v6.16b, v5.16b |
| and v26.16b, v7.16b, v8.16b |
| bmi .Lxts_enc_5 |
| |
| ext v7.16b, v26.16b, v26.16b, #8 |
| add v18.2d, v17.2d, v17.2d |
| ldr q5, [x20], #16 |
| eor v3.16b, v3.16b, v14.16b |
| str q17, [x0], #16 |
| subs x22, x22, #0x10 |
| eor v18.16b, v18.16b, v7.16b |
| bmi .Lxts_enc_6 |
| |
| ldr q6, [x20], #16 |
| eor v4.16b, v4.16b, v15.16b |
| eor v5.16b, v5.16b, v16.16b |
| str q18, [x0] // next round tweak |
| mov x9, sp // pass key schedule |
| mov x10, x1 |
| add x0, x19, #16 |
| sub x22, x22, #0x10 |
| eor v6.16b, v6.16b, v17.16b |
| |
| bl _bsaes_encrypt8 |
| |
| ldr q16, [x0], #16 |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| ldr q17, [x0], #16 |
| eor v4.16b, v4.16b, v13.16b |
| eor v6.16b, v6.16b, v14.16b |
| eor v3.16b, v3.16b, v15.16b |
| ldr q11, [x0] // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| eor v0.16b, v7.16b, v16.16b |
| eor v1.16b, v2.16b, v17.16b |
| str q4, [x21], #16 |
| str q6, [x21], #16 |
| str q3, [x21], #16 |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_6: |
| eor v4.16b, v4.16b, v15.16b |
| eor v5.16b, v5.16b, v16.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_encrypt8 |
| |
| ldr q16, [x0], #16 |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v4.16b, v4.16b, v13.16b |
| eor v6.16b, v6.16b, v14.16b |
| ldr q11, [x0] // next round tweak |
| eor v3.16b, v3.16b, v15.16b |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| eor v0.16b, v7.16b, v16.16b |
| str q4, [x21], #16 |
| str q6, [x21], #16 |
| str q3, [x21], #16 |
| str q0, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_5: |
| eor v3.16b, v3.16b, v14.16b |
| eor v4.16b, v4.16b, v15.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_encrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| ldr q11, [x0] // next round tweak |
| eor v4.16b, v4.16b, v13.16b |
| eor v6.16b, v6.16b, v14.16b |
| eor v3.16b, v3.16b, v15.16b |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q4, [x21], #16 |
| str q6, [x21], #16 |
| str q3, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_4: |
| eor v2.16b, v2.16b, v13.16b |
| eor v3.16b, v3.16b, v14.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_encrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v4.16b, v4.16b, v13.16b |
| eor v6.16b, v6.16b, v14.16b |
| mov v11.16b, v15.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q4, [x21], #16 |
| str q6, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_3: |
| eor v1.16b, v1.16b, v12.16b |
| eor v2.16b, v2.16b, v13.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_encrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v4.16b, v4.16b, v13.16b |
| mov v11.16b, v14.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q4, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_2: |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_encrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| mov v11.16b, v13.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| b .Lxts_enc_done |
| |
| .align 4 |
| .Lxts_enc_1: |
| eor v0.16b, v0.16b, v11.16b |
| sub x0, sp, #16 |
| sub x1, sp, #16 |
| mov x2, x23 |
| mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers |
| mov v14.d[0], v12.d[1] |
| str q0, [sp, #-16]! |
| |
| bl AES_encrypt |
| |
| ldr q0, [sp], #16 |
| trn1 v13.2d, v11.2d, v13.2d |
| trn1 v11.2d, v12.2d, v14.2d // next round tweak |
| eor v0.16b, v0.16b, v13.16b |
| str q0, [x21], #16 |
| |
| .Lxts_enc_done: |
| adds x22, x22, #0x10 |
| beq .Lxts_enc_ret |
| |
| sub x6, x21, #0x10 |
// The penultimate plaintext block produces the final ciphertext
// part-block plus the remaining part of the final plaintext block.
// Move the ciphertext part to its final position and re-use the
// penultimate ciphertext block buffer to construct the final
// plaintext block.
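// In C terms, the byte loop below does roughly the following (a sketch;
// `in`, `out` and `tail` are illustrative names, with `out` pointing at
// the final part-block and `tail` being the residue byte count):
//
//   for (size_t i = 0; i < tail; i++) {
//       unsigned char c = out[i - 16]; // stolen penultimate ciphertext byte
//       out[i - 16] = in[i];           // replace it with trailing plaintext
//       out[i] = c;                    // ciphertext part moves to final block
//   }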
| .Lxts_enc_steal: |
| ldrb w0, [x20], #1 |
| ldrb w1, [x21, #-0x10] |
| strb w0, [x21, #-0x10] |
| strb w1, [x21], #1 |
| |
| subs x22, x22, #1 |
| bhi .Lxts_enc_steal |
| |
| // Finally encrypt the penultimate ciphertext block using the |
| // last tweak |
| ldr q0, [x6] |
| eor v0.16b, v0.16b, v11.16b |
| str q0, [sp, #-16]! |
| mov x0, sp |
| mov x1, sp |
| mov x2, x23 |
| mov x21, x6 |
| mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers |
| |
| bl AES_encrypt |
| |
| trn1 v11.2d, v11.2d, v13.2d |
| ldr q0, [sp], #16 |
| eor v0.16b, v0.16b, v11.16b |
| str q0, [x21] |
| |
| .Lxts_enc_ret: |
| |
| movi v0.16b, #0 |
| movi v1.16b, #0 |
| .Lxts_enc_bzero: // wipe key schedule |
| stp q0, q1, [sp], #32 |
| cmp sp, x19 |
| bne .Lxts_enc_bzero |
| |
| ldp x19, x20, [sp, #80] |
| ldp x21, x22, [sp, #96] |
| ldr x23, [sp, #112] |
| ldp d8, d9, [sp, #128] |
| ldp d10, d11, [sp, #144] |
| ldp d12, d13, [sp, #160] |
| ldp d14, d15, [sp, #176] |
| ldp fp, lr, [sp], #192 |
| ret |
| .size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt |
| |
// The assembler doesn't seem capable of de-duplicating these constants when
// expressed using `ldr qd,=` syntax, so assign them a symbolic address instead.
| .align 5 |
| .Lxts_magic: |
| .quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 |
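
// The tweak updates in the XTS loops are doublings in GF(2^128) modulo
// the polynomial x^128 + x^7 + x^2 + x + 1, applied to the 16-byte tweak
// in its little-endian representation. A byte-wise C sketch of one
// doubling (illustrative only, not part of the build):
//
//   int carry = t[15] >> 7;                 // bit shifted out of the top
//   for (int i = 15; i > 0; i--)
//       t[i] = (unsigned char)((t[i] << 1) | (t[i - 1] >> 7));
//   t[0] = (unsigned char)((t[0] << 1) ^ (carry ? 0x87 : 0));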
| |
| .globl ossl_bsaes_xts_decrypt |
| .type ossl_bsaes_xts_decrypt,%function |
| .align 4 |
| // On entry: |
| // x0 -> input ciphertext |
| // x1 -> output plaintext |
// x2 = length of text in bytes (must be at least 16)
| // x3 -> key1 (used to decrypt the XORed ciphertext blocks) |
| // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) |
| // x5 -> 16-byte initial vector (typically, sector number) |
| // On exit: |
| // Output plaintext filled in |
| // No output registers, usual AAPCS64 register preservation |
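// Seen from C, this entry point corresponds to a prototype along the
// lines of (parameter names are illustrative):
//
//   void ossl_bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
//                               size_t len, const AES_KEY *key1,
//                               const AES_KEY *key2, const unsigned char iv[16]);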
| ossl_bsaes_xts_decrypt: |
| // Stack layout: |
| // sp -> |
| // nrounds*128-96 bytes: key schedule |
| // x19 -> |
| // 16 bytes: frame record |
| // 4*16 bytes: tweak storage across _bsaes_decrypt8 |
// 6*8 bytes: storage for 5 callee-saved general-purpose registers (one slot is alignment padding)
| // 8*8 bytes: storage for 8 callee-saved SIMD registers |
| stp fp, lr, [sp, #-192]! |
| stp x19, x20, [sp, #80] |
| stp x21, x22, [sp, #96] |
| str x23, [sp, #112] |
| stp d8, d9, [sp, #128] |
| stp d10, d11, [sp, #144] |
| stp d12, d13, [sp, #160] |
| stp d14, d15, [sp, #176] |
| |
| mov x19, sp |
| mov x20, x0 |
| mov x21, x1 |
| mov x22, x2 |
| mov x23, x3 |
| |
| // generate initial tweak |
| sub sp, sp, #16 |
| mov x0, x5 // iv[] |
| mov x1, sp |
| mov x2, x4 // key2 |
| bl AES_encrypt |
| ldr q11, [sp], #16 |
| |
| ldr w1, [x23, #240] // get # of rounds |
| // allocate the key schedule on the stack |
| add x17, sp, #96 |
| sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes |
| |
| // populate the key schedule |
| mov x9, x23 // pass key |
| mov x10, x1 // pass # of rounds |
| mov sp, x17 |
| bl _bsaes_key_convert |
| ldr q6, [sp] |
| str q15, [x17] // save last round key |
| eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) |
| str q6, [sp] |
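// As in the AArch32 bsaes code this was translated from, the
// bit-sliced S-box omits the 0x63 affine constant of SubBytes;
// folding it into the round-0 key here (v7 holds a vector of 0x63)
// compensates for that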
| |
| sub x30, x22, #0x10 |
tst x22, #0xf // if the length is not a multiple of 16,
csel x22, x30, x22, ne // hold back the last full block for ciphertext stealing
| subs x22, x22, #0x80 |
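// In C terms (illustrative only): if (len & 15) len -= 16; then the
// bulk loop below runs while at least eight blocks (0x80 bytes)
// remain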
| |
| blo .Lxts_dec_short |
| b .Lxts_dec_loop |
| |
| .align 4 |
| .Lxts_dec_loop: |
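// Each iteration consumes eight ciphertext blocks. The tweaks are
// produced by repeated GF(2^128) doubling, interleaved with the
// block loads for scheduling: five stay live in v11-v15, three more
// are spilled to the scratch area at x19+16, and a ninth value is
// stored there as the next iteration's starting tweak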
| ldr q8, .Lxts_magic |
| mov x10, x1 // pass rounds |
| add x2, x19, #16 |
| ldr q0, [x20], #16 |
| sshr v1.2d, v11.2d, #63 |
| mov x9, sp // pass key schedule |
| ldr q6, .Lxts_magic+16 |
| add v2.2d, v11.2d, v11.2d |
| cmtst v3.2d, v11.2d, v6.2d |
| and v1.16b, v1.16b, v8.16b |
| ext v1.16b, v1.16b, v1.16b, #8 |
| and v3.16b, v3.16b, v8.16b |
| ldr q4, [x20], #16 |
| eor v12.16b, v2.16b, v1.16b |
| eor v1.16b, v4.16b, v12.16b |
| eor v0.16b, v0.16b, v11.16b |
| cmtst v2.2d, v12.2d, v6.2d |
| add v4.2d, v12.2d, v12.2d |
| add x0, x19, #16 |
| ext v3.16b, v3.16b, v3.16b, #8 |
| and v2.16b, v2.16b, v8.16b |
| eor v13.16b, v4.16b, v3.16b |
| ldr q3, [x20], #16 |
| ext v4.16b, v2.16b, v2.16b, #8 |
| eor v2.16b, v3.16b, v13.16b |
| ldr q3, [x20], #16 |
| add v5.2d, v13.2d, v13.2d |
| cmtst v7.2d, v13.2d, v6.2d |
| and v7.16b, v7.16b, v8.16b |
| ldr q9, [x20], #16 |
| ext v7.16b, v7.16b, v7.16b, #8 |
| ldr q10, [x20], #16 |
| eor v14.16b, v5.16b, v4.16b |
| ldr q16, [x20], #16 |
| add v4.2d, v14.2d, v14.2d |
| eor v3.16b, v3.16b, v14.16b |
| eor v15.16b, v4.16b, v7.16b |
| add v5.2d, v15.2d, v15.2d |
| ldr q7, [x20], #16 |
| cmtst v4.2d, v14.2d, v6.2d |
| and v17.16b, v4.16b, v8.16b |
| cmtst v18.2d, v15.2d, v6.2d |
| eor v4.16b, v9.16b, v15.16b |
| ext v9.16b, v17.16b, v17.16b, #8 |
| eor v9.16b, v5.16b, v9.16b |
| add v17.2d, v9.2d, v9.2d |
| and v18.16b, v18.16b, v8.16b |
| eor v5.16b, v10.16b, v9.16b |
| str q9, [x2], #16 |
| ext v10.16b, v18.16b, v18.16b, #8 |
| cmtst v9.2d, v9.2d, v6.2d |
| and v9.16b, v9.16b, v8.16b |
| eor v10.16b, v17.16b, v10.16b |
| cmtst v17.2d, v10.2d, v6.2d |
| eor v6.16b, v16.16b, v10.16b |
| str q10, [x2], #16 |
| ext v9.16b, v9.16b, v9.16b, #8 |
| add v10.2d, v10.2d, v10.2d |
| eor v9.16b, v10.16b, v9.16b |
| str q9, [x2], #16 |
| eor v7.16b, v7.16b, v9.16b |
| add v9.2d, v9.2d, v9.2d |
| and v8.16b, v17.16b, v8.16b |
| ext v8.16b, v8.16b, v8.16b, #8 |
| eor v8.16b, v9.16b, v8.16b |
| str q8, [x2] // next round tweak |
| |
| bl _bsaes_decrypt8 |
| |
| eor v6.16b, v6.16b, v13.16b |
| eor v0.16b, v0.16b, v11.16b |
| ldr q8, [x0], #16 |
| eor v7.16b, v7.16b, v8.16b |
| str q0, [x21], #16 |
| eor v0.16b, v1.16b, v12.16b |
| ldr q1, [x0], #16 |
| eor v1.16b, v3.16b, v1.16b |
| subs x22, x22, #0x80 |
| eor v2.16b, v2.16b, v15.16b |
| eor v3.16b, v4.16b, v14.16b |
| ldr q4, [x0], #16 |
| str q0, [x21], #16 |
| ldr q11, [x0] // next round tweak |
| eor v0.16b, v5.16b, v4.16b |
| str q6, [x21], #16 |
| str q3, [x21], #16 |
| str q2, [x21], #16 |
| str q7, [x21], #16 |
| str q1, [x21], #16 |
| str q0, [x21], #16 |
| bpl .Lxts_dec_loop |
| |
| .Lxts_dec_short: |
| adds x22, x22, #0x70 |
| bmi .Lxts_dec_done |
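// 1-7 whole blocks remain. Generate one more tweak per block,
// dropping into the matching .Lxts_dec_1-.Lxts_dec_6 tail as soon as
// the count runs out; the fall-through path handles seven blocks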
| |
| ldr q8, .Lxts_magic |
| sshr v1.2d, v11.2d, #63 |
| add v2.2d, v11.2d, v11.2d |
| ldr q9, .Lxts_magic+16 |
| subs x22, x22, #0x10 |
| ldr q0, [x20], #16 |
| and v1.16b, v1.16b, v8.16b |
| cmtst v3.2d, v11.2d, v9.2d |
| ext v1.16b, v1.16b, v1.16b, #8 |
| and v3.16b, v3.16b, v8.16b |
| eor v12.16b, v2.16b, v1.16b |
| ext v1.16b, v3.16b, v3.16b, #8 |
| add v2.2d, v12.2d, v12.2d |
| cmtst v3.2d, v12.2d, v9.2d |
| eor v13.16b, v2.16b, v1.16b |
| and v22.16b, v3.16b, v8.16b |
| bmi .Lxts_dec_1 |
| |
| ext v2.16b, v22.16b, v22.16b, #8 |
| add v3.2d, v13.2d, v13.2d |
| ldr q1, [x20], #16 |
| cmtst v4.2d, v13.2d, v9.2d |
| subs x22, x22, #0x10 |
| eor v14.16b, v3.16b, v2.16b |
| and v23.16b, v4.16b, v8.16b |
| bmi .Lxts_dec_2 |
| |
| ext v3.16b, v23.16b, v23.16b, #8 |
| add v4.2d, v14.2d, v14.2d |
| ldr q2, [x20], #16 |
| cmtst v5.2d, v14.2d, v9.2d |
| eor v0.16b, v0.16b, v11.16b |
| subs x22, x22, #0x10 |
| eor v15.16b, v4.16b, v3.16b |
| and v24.16b, v5.16b, v8.16b |
| bmi .Lxts_dec_3 |
| |
| ext v4.16b, v24.16b, v24.16b, #8 |
| add v5.2d, v15.2d, v15.2d |
| ldr q3, [x20], #16 |
| cmtst v6.2d, v15.2d, v9.2d |
| eor v1.16b, v1.16b, v12.16b |
| subs x22, x22, #0x10 |
| eor v16.16b, v5.16b, v4.16b |
| and v25.16b, v6.16b, v8.16b |
| bmi .Lxts_dec_4 |
| |
| ext v5.16b, v25.16b, v25.16b, #8 |
| add v6.2d, v16.2d, v16.2d |
| add x0, x19, #16 |
| cmtst v7.2d, v16.2d, v9.2d |
| ldr q4, [x20], #16 |
| eor v2.16b, v2.16b, v13.16b |
| str q16, [x0], #16 |
| subs x22, x22, #0x10 |
| eor v17.16b, v6.16b, v5.16b |
| and v26.16b, v7.16b, v8.16b |
| bmi .Lxts_dec_5 |
| |
| ext v7.16b, v26.16b, v26.16b, #8 |
| add v18.2d, v17.2d, v17.2d |
| ldr q5, [x20], #16 |
| eor v3.16b, v3.16b, v14.16b |
| str q17, [x0], #16 |
| subs x22, x22, #0x10 |
| eor v18.16b, v18.16b, v7.16b |
| bmi .Lxts_dec_6 |
| |
| ldr q6, [x20], #16 |
| eor v4.16b, v4.16b, v15.16b |
| eor v5.16b, v5.16b, v16.16b |
| str q18, [x0] // next round tweak |
| mov x9, sp // pass key schedule |
mov x10, x1 // pass rounds
| add x0, x19, #16 |
| sub x22, x22, #0x10 |
| eor v6.16b, v6.16b, v17.16b |
| |
| bl _bsaes_decrypt8 |
| |
| ldr q16, [x0], #16 |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| ldr q17, [x0], #16 |
| eor v6.16b, v6.16b, v13.16b |
| eor v4.16b, v4.16b, v14.16b |
| eor v2.16b, v2.16b, v15.16b |
| ldr q11, [x0] // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| eor v0.16b, v7.16b, v16.16b |
| eor v1.16b, v3.16b, v17.16b |
| str q6, [x21], #16 |
| str q4, [x21], #16 |
| str q2, [x21], #16 |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_6: |
| eor v4.16b, v4.16b, v15.16b |
| eor v5.16b, v5.16b, v16.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_decrypt8 |
| |
| ldr q16, [x0], #16 |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| eor v4.16b, v4.16b, v14.16b |
| ldr q11, [x0] // next round tweak |
| eor v2.16b, v2.16b, v15.16b |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| eor v0.16b, v7.16b, v16.16b |
| str q6, [x21], #16 |
| str q4, [x21], #16 |
| str q2, [x21], #16 |
| str q0, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_5: |
| eor v3.16b, v3.16b, v14.16b |
| eor v4.16b, v4.16b, v15.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_decrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| ldr q11, [x0] // next round tweak |
| eor v6.16b, v6.16b, v13.16b |
| eor v4.16b, v4.16b, v14.16b |
| eor v2.16b, v2.16b, v15.16b |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q6, [x21], #16 |
| str q4, [x21], #16 |
| str q2, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_4: |
| eor v2.16b, v2.16b, v13.16b |
| eor v3.16b, v3.16b, v14.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_decrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| eor v4.16b, v4.16b, v14.16b |
| mov v11.16b, v15.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q6, [x21], #16 |
| str q4, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_3: |
| eor v1.16b, v1.16b, v12.16b |
| eor v2.16b, v2.16b, v13.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_decrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| mov v11.16b, v14.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| str q6, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_2: |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| mov x9, sp // pass key schedule |
| mov x10, x1 // pass rounds |
| add x0, x19, #16 |
| |
| bl _bsaes_decrypt8 |
| |
| eor v0.16b, v0.16b, v11.16b |
| eor v1.16b, v1.16b, v12.16b |
| mov v11.16b, v13.16b // next round tweak |
| str q0, [x21], #16 |
| str q1, [x21], #16 |
| b .Lxts_dec_done |
| |
| .align 4 |
| .Lxts_dec_1: |
| eor v0.16b, v0.16b, v11.16b |
sub x0, sp, #16 // x0/x1 will point at the block pushed just below sp
sub x1, sp, #16
| mov x2, x23 |
| mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers |
| mov v14.d[0], v12.d[1] |
| str q0, [sp, #-16]! |
| |
| bl AES_decrypt |
| |
| ldr q0, [sp], #16 |
| trn1 v13.2d, v11.2d, v13.2d |
| trn1 v11.2d, v12.2d, v14.2d // next round tweak |
| eor v0.16b, v0.16b, v13.16b |
| str q0, [x21], #16 |
| |
| .Lxts_dec_done: |
| adds x22, x22, #0x10 |
| beq .Lxts_dec_ret |
| |
// advance the tweak one extra step for the stolen ciphertext
| ldr q8, .Lxts_magic |
| sshr v6.2d, v11.2d, #63 |
| and v6.16b, v6.16b, v8.16b |
| add v12.2d, v11.2d, v11.2d |
| ext v6.16b, v6.16b, v6.16b, #8 |
| eor v12.16b, v12.16b, v6.16b |
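// Unlike encryption, decryption meets the final blocks in the
// "wrong" order: the full block on the wire decrypts under the later
// tweak (v12), while the block reconstructed by stealing decrypts
// under the earlier one (v11), so both values are needed up front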
| |
| // perform the final decryption with the last tweak value |
| ldr q0, [x20], #16 |
| eor v0.16b, v0.16b, v12.16b |
| str q0, [sp, #-16]! |
| mov x0, sp |
| mov x1, sp |
| mov x2, x23 |
| mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers |
| mov v14.d[0], v12.d[1] |
| |
| bl AES_decrypt |
| |
| trn1 v12.2d, v12.2d, v14.2d |
| trn1 v11.2d, v11.2d, v13.2d |
| ldr q0, [sp], #16 |
| eor v0.16b, v0.16b, v12.16b |
| str q0, [x21] |
| |
| mov x6, x21 |
// Decrypting the penultimate ciphertext block yields the final
// plaintext part-block plus the remaining part of the last
// ciphertext block still to be decrypted. Move the plaintext part to
// its final position and re-use the penultimate plaintext block's
// buffer to construct that final ciphertext block
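// A rough C sketch of the loop below (illustrative only; `in`, `out`
// and `tail` are hypothetical names: `in` points at the trailing
// partial ciphertext, `out` at the block just decrypted, and `tail`
// is the partial length in bytes):
//   for (i = 0; i < tail; i++) {
//       out[i + 16] = out[i]; // plaintext head to its final position
//       out[i] = in[i];       // rebuild the final ciphertext block
//   }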
| .Lxts_dec_steal: |
| ldrb w1, [x21] |
| ldrb w0, [x20], #1 |
| strb w1, [x21, #0x10] |
| strb w0, [x21], #1 |
| |
| subs x22, x22, #1 |
| bhi .Lxts_dec_steal |
| |
// Finally decrypt the reconstructed ciphertext block (now sitting in
// the penultimate plaintext block's buffer) using the penultimate
// tweak
| ldr q0, [x6] |
| eor v0.16b, v0.16b, v11.16b |
| str q0, [sp, #-16]! |
| mov x0, sp |
| mov x1, sp |
| mov x2, x23 |
| mov x21, x6 |
| |
| bl AES_decrypt |
| |
| trn1 v11.2d, v11.2d, v13.2d |
| ldr q0, [sp], #16 |
| eor v0.16b, v0.16b, v11.16b |
| str q0, [x21] |
| |
| .Lxts_dec_ret: |
| |
| movi v0.16b, #0 |
| movi v1.16b, #0 |
| .Lxts_dec_bzero: // wipe key schedule |
| stp q0, q1, [sp], #32 |
| cmp sp, x19 |
| bne .Lxts_dec_bzero |
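// (sp sits at the base of the key schedule and x19 at the saved
// stack base; the nrounds*128-96 bytes in between are a multiple of
// 32, so the 32-byte stores land exactly on x19)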
| |
| ldp x19, x20, [sp, #80] |
| ldp x21, x22, [sp, #96] |
| ldr x23, [sp, #112] |
| ldp d8, d9, [sp, #128] |
| ldp d10, d11, [sp, #144] |
| ldp d12, d13, [sp, #160] |
| ldp d14, d15, [sp, #176] |
| ldp fp, lr, [sp], #192 |
| ret |
| .size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt |