blob: ff5809ec22df662f2cf84b355c6eee8007c599c6 [file] [log] [blame]
#! /usr/bin/env perl
# Copyright 2019-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#========================================================================
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
#========================================================================
#
# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
#
# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
#
# ____________________________________________________
# | |
# | PRE |
# |____________________________________________________|
# | | | |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# | |
# | MODULO |
# |____________________________________________________|
#
# PRE:
# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block:
# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
# REV ctr32, rev_ctr32
# ORR ctr64, constctr96_top32, ctr32, LSL #32
# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
# INS ctr_next.d[1], ctr64X
# ADD rev_ctr32, #1
#
# AES block:
# Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key (round keys k0..k14) is used as the example below.
# A small trick is used here: the data is handled in scalar registers for the EOR with the last round key, and only then moved to/from the SIMD registers.
# Given how constrained we are in ASIMD registers, this is quite important.
#
# Encrypt:
# LDR input_low, [ input_ptr ], #8
# LDR input_high, [ input_ptr ], #8
# EOR input_low, k14_low
# EOR input_high, k14_high
# INS res_curr.d[0], input_low
# INS res_curr.d[1], input_high
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# EOR res_curr, res_curr, ctr_curr
# ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# LDR res_curr, [ input_ptr ], #16
# EOR res_curr, res_curr, ctr_curr
# MOV output_low, res_curr.d[0]
# MOV output_high, res_curr.d[1]
# EOR output_low, k14_low
# EOR output_high, k14_high
# STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
# do 128b karatsuba polynomial multiplication on block
# We only have 64b->128b polynomial multipliers; naively that means we need to do 4 64b multiplies to generate
# one 128b multiplication:
#
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
#
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
# multiplying with "twisted" powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
# path latency dominates the performance
#
# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
# than indicated here
# REV64 res_curr, res_curr
# INS t_m.d[0], res_curr.d[1]
# EOR t_m.8B, t_m.8B, res_curr.8B
# PMULL2 t_h, res_curr, HX
# PMULL t_l, res_curr, HX
# PMULL t_m, t_m, HX_k
# EOR acc_h, acc_h, t_h
# EOR acc_l, acc_l, t_l
# EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
# with a reversed constant
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
# PMULL t_mod, acc_h, mod_constant
# EXT acc_h, acc_h, acc_h, #8
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, t_mod
# PMULL acc_h, acc_m, mod_constant
# EXT acc_m, acc_m, acc_m, #8
# EOR acc_l, acc_l, acc_h
# EOR acc_l, acc_l, acc_m
# Command line handling shared by the perlasm generators:
#   - the last argument is taken as the output file if it looks like
#     "name.ext";
#   - the first remaining argument is taken as the flavour if it contains
#     no dot.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl next to this script, or in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path is quoted so a directory containing spaces does not
# split the command line, and a failure to start the pipe is now fatal
# instead of leaving STDOUT attached to a dead handle.  $flavour and $output
# are passed exactly as before so the translator sees the same argv.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Argument block: x0..x3 are referred to by these names throughout the
# generated code.
($input_ptr,$bit_length,$output_ptr,$current_tag)=map("x$_",(0..3));
# The kernel prologue copies x4 into x16 and x5 into x8; the code then
# addresses them through these two names.
($counter,$cc)=("x16","x8");
{
# Register allocation for the kernel(s) generated in this scope.
# Each variable holds the textual name of a register (or a view of one) and
# is interpolated into the assembly heredocs.  Several names deliberately
# alias the same physical register; the aliases are called out below.

# Scalar pointers/lengths and the block-0 input halves: x4..x7.
# x4 arrives holding an argument; the prologue moves it to x16 before x4 is
# reused as $end_input_ptr.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# Input halves for blocks 1..3: x19..x24 (the prologue stores x19..x24 on the
# stack before using them).
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# Output names alias the very same scalar registers as the input names
# (x19..x24 for blocks 1..3, x6/x7 for block 0).
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));
# Counter state and last round key in scalar registers, x9..x15:
#   x9  ctr32 (w9 is its 32-bit view), x10 ctr96_b64, x11 ctr96_t32,
#   x12 rctr32, x13/x14 rk10 low/high (loaded from [$cc, #160]), x15 len.
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
# 32-bit views of x11 and x12.
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# v0..v3 hold the four counter blocks, v4..v7 the four result blocks.
# The same registers are named three times: ".16b" view, bare "vN", and "dN".
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
# "qN" view of the result registers v4..v7.
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# GHASH accumulators (high, mid, low): v9..v11, again in three views.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# Hash-key registers: h1..h4 in v12..v15 (loaded from offsets of $current_tag),
# and h12k/h34k in v16/v17 (built with trn1/trn2/eor from h1..h4).
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# Temporaries.  Note the aliasing:
#   t0 = t4 = mod_constant = v8
#   t1 = t5 = v28,  t2 = t8 = v29,  t3 = t9 = v30
#   t6 = mod_t = v31
#   t7 = v4 (same register as res0 / ctr_t0)
my $t0="v8";
my $t0d="d8";
my ($t1,$t2,$t3)=map("v$_",(28..30));
my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
my $t4="v8";
my $t4d="d8";
my $t5="v28";
my $t5d="d28";
my $t6="v31";
my $t6d="d31";
my $t7="v4";
my $t7d="d4";
my $t8="v29";
my $t8d="d29";
my $t9="v30";
my $t9d="d30";
# ctr_t0..ctr_t3 alias the result registers v4..v7; they receive the input
# block (already EORed with rk10 in scalar registers) before the final EOR.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# Reduction constant lives in v8 (shared with t0/t4) and is rebuilt with
# movi/shl each time it is needed; mod_t shares v31 with t6.
my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v31";
# Round keys rk0..rk9 in v18..v27 (".16b" view for aese, "qN" view for ldr).
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
# Extra views of the rk2/rk3/rk4 registers (v20/v21/v22).  The tail code
# writes to these as scratch for GHASH (e.g. pmull2 into $rk2q1), i.e. the
# round keys in them are overwritten there.
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";
# Start of the generated assembly: the arm_arch.h include followed by the
# __ARM_MAX_ARCH__>=8 guard that wraps what comes after it.
$code = qq{#include "arm_arch.h"\n} . qq{#if __ARM_MAX_ARCH__>=8\n};

if ($flavour =~ /64/) {
    # Flavours containing "64": enable the crypto extension and open .text.
    $code .= ".arch armv8-a+crypto\n";
    $code .= ".text\n";
} else {
    # Every other flavour: NEON FPU directive plus an INST() macro whose byte
    # order depends on whether Thumb-2 is in use.
    # NOTE(review): $_byte is not assigned anywhere in the visible part of
    # this file; if it is unset the macro body starts with an empty
    # directive -- confirm whether this path is ever exercised.
    $code .= <<___;
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif
.text
___
}
#########################################################################################
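# Argument order as consumed by the code below (x0..x5):
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,   /* x0 */
# size_t len,                                              /* x1: length in bits (shifted right by 3 to get bytes) */
# unsigned char *out,                                      /* x2 */
# u64 *Xi,                                                 /* x3: current tag; hash-key powers are loaded from offsets of this pointer */
# unsigned char ivec[16],                                  /* x4: counter block, copied to x16 */
# const void *key);                                        /* x5: round keys, copied to x8 */
#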
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
ld1 {$acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldr $rk9q, [$cc, #144] @ load rk9
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
lsr $rctr32x, $ctr96_t32x, #32
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
fmov $ctr1d, $ctr96_b64x @ CTR block 1
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk0q, [$cc, #0] @ load rk0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk1q, [$cc, #16] @ load rk1
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr1b, $rk9 @ AES block 1 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
b.ge .L128_enc_tail @ handle tail
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
rev $ctr32w, $rctr32w @ CTR block 4
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr1.d[1], $ctr32x @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L128_enc_prepretail @ do prepretail
.L128_enc_main_loop: @ main loop start
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev $ctr32w, $rctr32w @ CTR block 4k+8
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
rev $ctr32w, $rctr32w @ CTR block 4k+11
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
b.lt .L128_enc_main_loop
.L128_enc_prepretail: @ PREPRETAIL
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
eor $acc_lb, $acc_lb, $acc_mb
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
.L128_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
cmp $main_end_input_ptr, #48
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L128_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
mov $ctr3b, $ctr2b
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_h.8b, #0
movi $acc_m.8b, #0
b.gt .L128_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L128_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
fmov $res1d, $input_l0 @ AES final-2 block - mov low
movi $t0.8b, #0 @ suppress further partial tag feed in
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
.L128_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk10_l, $rk10_h, lt
csel $input_h0, $rk10_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
rev $ctr32w, $rctr32w
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr] @ store all 16B
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
___
#########################################################################################
# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
$code.=<<___;
.global aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk0q, [$cc, #0] @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
ldr $rk1q, [$cc, #16] @ load rk1
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ldr $rk2q, [$cc, #32] @ load rk2
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr1b, $rk9 @ AES block 1 - round 9
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
b.ge .L128_dec_tail @ handle tail
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev64 $res0b, $res0b @ GHASH block 0
rev $ctr32w, $rctr32w @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
rev64 $res1b, $res1b @ GHASH block 1
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
fmov $ctr1d, $ctr96_b64x @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
b.ge .L128_dec_prepretail @ do prepretail
.L128_dec_main_loop: @ main loop start - finishes AES blocks 4k+2/4k+3, GHASHes blocks 4k..4k+3, runs AES for blocks 4k+4..4k+7
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
movi $mod_constant.8b, #0xc2 @ mod_constant - 0xc2 in each low byte; rebuilt every pass because the MODULO pmull below overwrites it
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
b.lt .L128_dec_main_loop @ LOOP CONTROL - branch back while input_ptr is below main_end_input_ptr (flags from the cmp above)
.L128_dec_prepretail: @ PREPRETAIL
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L128_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
b.gt .L128_dec_blocks_more_than_3
mov $ctr3b, $ctr2b
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #32
b.gt .L128_dec_blocks_more_than_2
cmp $main_end_input_ptr, #16
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
b.gt .L128_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
.L128_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
.L128_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: @ blocks left <= 1
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr96_b64x, $rk10_h, xzr, lt
csel $ctr32x, $rk10_l, $rk10_h, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
mov $ctr0.d[1], $ctr96_b64x
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
and $output_h0, $output_h0, $ctr96_b64x
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
and $output_l0, $output_l0, $ctr32x
rev $ctr32w, $rctr32w
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
orr $output_l0, $output_l0, $end_input_ptr
str $ctr32w, [$counter, #12] @ store the updated counter
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
___
}
{
# Register aliases for aes_gcm_enc_192_kernel.  Several names below map to the
# same physical register on purpose; two names that share a register must not
# be live at the same time in the assembly that follows.

# General-purpose registers.  x6/x7 are reachable both as input_{l,h}0 and as
# output_{l,h}0, and x19-x24 carry the low/high halves of blocks 1-3 under
# either the input_* or the output_* names.
my ($end_input_ptr, $main_end_input_ptr, $input_l0, $input_h0) = map { "x$_" } 4 .. 7;
my ($output_l0, $output_h0) = map { "x$_" } 6 .. 7;
my ($input_l1, $input_h1, $input_l2, $input_h2, $input_l3, $input_h3) = map { "x$_" } 19 .. 24;
my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map { "x$_" } 19 .. 24;

# Counter state, the two 64-bit halves of round key 12, and the byte length.
# The w-names are the 32-bit views of x9, x11 and x12.
my ($ctr32x, $ctr96_b64x, $ctr96_t32x, $rctr32x, $rk12_l, $rk12_h, $len) = map { "x$_" } 9 .. 15;
my ($ctr32w, $ctr96_t32w, $rctr32w) = map { "w$_" } 9, 11, 12;

# v0-v3 hold the counter blocks and v4-v7 the result blocks.  Each register is
# nameable as a bare vN, as a vN.16b arrangement, or through its dN/qN view.
my ($ctr0, $ctr1, $ctr2, $ctr3, $res0, $res1, $res2, $res3) = map { "v$_" } 0 .. 7;
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $res0b, $res1b, $res2b, $res3b) = map { "v$_.16b" } 0 .. 7;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $res0d, $res1d, $res2d, $res3d) = map { "d$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q) = map { "q$_" } 4 .. 7;

# $ctr_t0..$ctr_t3 are additional names for v4-v7 (the $res0..$res3 registers).
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map { "v$_" } 4 .. 7;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b) = map { "v$_.16b" } 4 .. 7;
my ($ctr_t0d, $ctr_t1d, $ctr_t2d, $ctr_t3d) = map { "d$_" } 4 .. 7;

# GHASH accumulator: high, middle and low parts in v9, v10 and v11.
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 9 .. 11;
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 9 .. 11;
my ($acc_hd, $acc_md, $acc_ld) = map { "d$_" } 9 .. 11;

# GHASH key material: h1-h4 in v12-v15, h12k and h34k in v16 and v17.
my ($h1, $h2, $h3, $h4, $h12k, $h34k) = map { "v$_" } 12 .. 17;
my ($h1b, $h2b, $h3b, $h4b) = map { "v$_.16b" } 12 .. 15;
my ($h1q, $h2q, $h3q, $h4q) = map { "q$_" } 12 .. 15;

# Scratch names.  Only six physical registers back them:
#   v8         - $t0, $t5, $mod_constant
#   v30        - $t1, $t4, $t9
#   v31        - $t2, $t6, $mod_t
#   v4, v5, v6 - $t3, $t7, $t8 (the same registers as $res0, $res1, $res2)
my ($t0, $t0d) = ("v8", "d8");
my ($t1, $t2) = map { "v$_" } 30 .. 31;
my ($t1d, $t2d) = map { "d$_" } 30 .. 31;
my ($t3, $t3d) = ("v4", "d4");
my ($t4, $t4d) = ("v30", "d30");
my ($t5, $t5d) = ("v8", "d8");
my ($t6, $t6d) = ("v31", "d31");
my ($t7, $t7d) = ("v5", "d5");
my ($t8, $t8d) = ("v6", "d6");
my ($t9, $t9d) = ("v30", "d30");
my ($mod_constant, $mod_constantd) = ("v8", "d8");
my $mod_t = "v31";

# Round keys rk0-rk11 live in v18-v29.  $rk2q1, $rk3q1 and $rk4v/$rk4d name
# v20, v21 and v22 a second time so the tail code can use them as GHASH
# temporaries.
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8, $rk9, $rk10, $rk11) = map { "v$_.16b" } 18 .. 29;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q, $rk9q, $rk10q, $rk11q) = map { "q$_" } 18 .. 29;
my ($rk2q1, $rk3q1) = map { "v$_.1q" } 20 .. 21;
my ($rk4v, $rk4d) = ("v22", "d22");
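#########################################################################################
# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
# NOTE(review): the code below returns early when x1 is zero and derives the
# byte length as bit_length >> 3, so the length appears to be passed in bits,
# not bytes.  It also copies x4 and x5 into x16 and x8 before any counter or
# round-key load.  Confirm that the units and the argument order shown in this
# prototype agree with the C declaration used by the callers.
#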
$code.=<<___;
.global aes_gcm_enc_192_kernel
.type aes_gcm_enc_192_kernel,%function
.align 4
aes_gcm_enc_192_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldr $rk5q, [$cc, #80] @ load rk5
ldr $rk4q, [$cc, #64] @ load rk4
ldr $rk8q, [$cc, #128] @ load rk8
lsr $rctr32x, $ctr96_t32x, #32
ldr $rk6q, [$cc, #96] @ load rk6
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk7q, [$cc, #112] @ load rk7
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1d, $ctr96_b64x @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk0q, [$cc, #0] @ load rk0
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $rk3q, [$cc, #48] @ load rk3
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr2b, $rk11 @ AES block 2 - round 11
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk11 @ AES block 1 - round 11
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk11 @ AES block 0 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk11 @ AES block 3 - round 11
b.ge .L192_enc_tail @ handle tail
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L192_enc_prepretail @ do prepretail
.L192_enc_main_loop: @ main loop start
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
b.lt .L192_enc_main_loop
.L192_enc_prepretail: @ PREPRETAIL
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
eor $acc_lb, $acc_lb, $acc_mb
.L192_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
cmp $main_end_input_ptr, #48
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
b.gt .L192_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_m.8b, #0
mov $ctr3b, $ctr2b
movi $acc_h.8b, #0
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_l.8b, #0
b.gt .L192_enc_blocks_more_than_2
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L192_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L192_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
rev64 $res0b, $res1b @ GHASH final-1 block
eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_enc_blocks_less_than_1: @ blocks left <= 1
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
rev $ctr32w, $rctr32w
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk12_l, $rk12_h, lt
csel $input_h0, $rk12_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
___
#########################################################################################
# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
# AES-192-GCM decryption kernel: counter-mode decryption of the input with
# the GHASH of the ciphertext computed alongside it.
#
# Interface notes, as far as this section of the file shows:
# - If x1 is zero the function returns 0 straight away (.L192_dec_ret),
#   before any stack frame is created.
# - $bit_length is shifted right by 3 to obtain the byte length, i.e. the
#   length is consumed as a bit count. That byte length is kept in $len and
#   is the value returned in x0.
#   NOTE(review): presumably $bit_length is x1 (the "len" argument above);
#   the register assignments are made outside this section - confirm there.
# - x4 and x5 are copied to x16 and x8 on entry.
#   NOTE(review): presumably this frees x4/x5 for use as scratch registers;
#   confirm against the register assignments, and confirm that the argument
#   order in the prototype above matches what the callers actually pass.
# - Round keys rk0..rk11 are loaded as 16-byte vectors from [$cc + 16*i].
#   rk12 is loaded as two 64-bit halves ($rk12_l/$rk12_h) from [$cc + 192]
#   and is applied with scalar EORs just before each plaintext store.
# - h1..h4 are loaded from [$current_tag + 32/64/80/112]. The running hash
#   is loaded from [$current_tag] and the final value is stored back there.
# - The updated 32-bit counter word is stored to [$counter + 12].
# - x19-x24 and d8-d15 are saved in a 112-byte stack frame and restored
#   before the normal return.
#
# Code layout:
# - Prologue: builds CTR blocks 0-3, loads the round keys and h1..h4, forms
#   h12k/h34k, and runs AES rounds 0-11 on the four counter blocks. The
#   main-loop limit is input_ptr + ((byte_len - 1) & ~63), so at least one
#   byte is always left for the tail. With no bytes for the main loop the
#   code branches directly to .L192_dec_tail.
# - .L192_dec_main_loop: four 16-byte blocks per iteration. GHASH of the
#   four ciphertext blocks loaded previously and its modular reduction are
#   interleaved with the AES rounds for counter blocks 4k+4..4k+7, the loads
#   of the next four ciphertext blocks and the plaintext stores.
# - .L192_dec_prepretail: same work as a loop iteration but without loading
#   further ciphertext; it stores plaintext blocks 4k+2 and 4k+3, hashes the
#   four blocks already loaded and finishes the AES rounds for the tail.
# - .L192_dec_tail and .L192_dec_blocks_more_than_{3,2,1}: handle the
#   remaining 1..64 bytes. When fewer than four blocks remain, $rctr32w is
#   decremented once per unused counter block.
# - .L192_dec_blocks_less_than_1: builds a mask from bit_length % 128,
#   masks the last ciphertext block before hashing it, and merges the last
#   plaintext block with the 16 bytes already present at the output so that
#   bytes beyond the message are preserved. Note that the last block is
#   always read from the input, and read from and written to the output, as
#   a full 16 bytes. The final reduction and the tag store follow.
#
$code.=<<___;
.global aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $rk0q, [$cc, #0] @ load rk0
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldr $rk2q, [$cc, #32] @ load rk2
lsr $rctr32x, $ctr96_t32x, #32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ldr $rk3q, [$cc, #48] @ load rk3
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr3b, $rk11 @ AES block 3 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr2b, $rk11 @ AES block 2 - round 11
aese $ctr1b, $rk11 @ AES block 1 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk11 @ AES block 0 - round 11
b.ge .L192_dec_tail @ handle tail
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev $ctr32w, $rctr32w @ CTR block 4
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
rev64 $res0b, $res0b @ GHASH block 0
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr0d, $ctr96_b64x @ CTR block 4
rev64 $res1b, $res1b @ GHASH block 1
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1d, $ctr96_b64x @ CTR block 5
eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
rev $ctr32w, $rctr32w @ CTR block 6
eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
b.ge .L192_dec_prepretail @ do prepretail
.L192_dec_main_loop: @ main loop start
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
rev64 $res1b, $res1b @ GHASH block 4k+5
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
rev64 $res0b, $res0b @ GHASH block 4k+4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
b.lt .L192_dec_main_loop
.L192_dec_prepretail: @ PREPRETAIL
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
rev64 $res2b, $res2b @ GHASH block 4k+2
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
rev64 $res3b, $res3b @ GHASH block 4k+3
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk11
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk11
aese $ctr1b, $rk11
aese $ctr3b, $rk11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L192_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
b.gt .L192_dec_blocks_more_than_3
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr3b, $ctr2b
mov $ctr2b, $ctr1b
sub $rctr32w, $rctr32w, #1
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #32
b.gt .L192_dec_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L192_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
.L192_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1: @ blocks left <= 1
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr32x, $rk12_l, $rk12_h, lt
csel $ctr96_b64x, $rk12_h, xzr, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
and $output_l0, $output_l0, $ctr32x
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
orr $output_l0, $output_l0, $end_input_ptr
mov $ctr0.d[1], $ctr96_b64x
rev $ctr32w, $rctr32w
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
str $ctr32w, [$counter, #12] @ store the updated counter
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
and $output_h0, $output_h0, $ctr96_b64x
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
___
}
{
# ---- general-purpose registers ----
# x4..x7: the two end pointers, then the low/high scratch pair for block 0.
# $output_l0/$output_h0 are second names for that same x6/x7 pair.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map { "x$_" } 4..7;
my ($output_l0,$output_h0)=map { "x$_" } 6..7;
# x19..x24: the input_* and output_* names are two views of the same six registers.
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map { "x$_" } 19..24;
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map { "x$_" } 19..24;
# x9..x15, plus the 32-bit views w9, w11 and w12 of x9, x11 and x12.
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map { "x$_" } 9..15;
my ($ctr32w,$ctr96_t32w,$rctr32w)=map { "w$_" } (9,11,12);

# ---- vector registers ----
# v0..v3 are named ctr0..ctr3 and v4..v7 res0..res3, each in bare, .16b and d-register spelling.
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map { "v$_" } 0..7;
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map { "v$_.16b" } 0..7;
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map { "d$_" } 0..7;
my ($res0q,$res1q,$res2q,$res3q)=map { "q$_" } 4..7;
# ctr_t0..ctr_t3 are further names for v4..v7, i.e. the same registers as res0..res3.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map { "v$_" } 4..7;
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map { "v$_.16b" } 4..7;
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map { "d$_" } 4..7;
# v9..v11: accumulator high / mid / low.
my ($acc_h,$acc_m,$acc_l)=map { "v$_" } 9..11;
my ($acc_hb,$acc_mb,$acc_lb)=map { "v$_.16b" } 9..11;
my ($acc_hd,$acc_md,$acc_ld)=map { "d$_" } 9..11;
# v12..v17: h1..h4 followed by h12k and h34k.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map { "v$_" } 12..17;
my ($h1b,$h2b,$h3b,$h4b)=map { "v$_.16b" } 12..15;
my ($h1q,$h2q,$h3q,$h4q)=map { "q$_" } 12..15;
# Temporaries t0..t9 are only names: they map onto v8 and onto v4..v6
# (which are also res0..res2), so any use overwrites whatever else lives there.
my ($t0,$t2,$t6)=("v8") x 3;
my ($t0d,$t2d,$t6d)=("d8") x 3;
my ($t1,$t3,$t4,$t9)=("v4") x 4;
my ($t1d,$t3d,$t4d,$t9d)=("d4") x 4;
my ($t5,$t7)=("v5") x 2;
my ($t5d,$t7d)=("d5") x 2;
my ($t8,$t8d)=("v6","d6");
# mod_constant shares v8 with t0/t2/t6; mod_t shares v7 with res3/ctr_t3.
my ($mod_constant,$mod_constantd,$mod_t)=("v8","d8","v7");
# v18..v31: round keys rk0..rk13, in .16b and q-register spelling.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map { "v$_.16b" } 18..31;
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map { "q$_" } 18..31;
# Extra spellings of v20/v21/v22, the registers that also back rk2/rk3/rk4.
my ($rk2q1,$rk3q1)=map { "v$_.1q" } 20..21;
my ($rk4v,$rk4d)=("v22","d22");
#########################################################################################
# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
# size_t len,
# unsigned char *out,
# const void *key,
# unsigned char ivec[16],
# u64 *Xi);
#
$code.=<<___;
.global aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk0q, [$cc, #0] @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ldr $rk7q, [$cc, #112] @ load rk7
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
fmov $ctr1d, $ctr96_b64x @ CTR block 1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
rev $ctr32w, $rctr32w @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
ldr $rk1q, [$cc, #16] @ load rk1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
ldr $rk2q, [$cc, #32] @ load rk2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk13q, [$cc, #208] @ load rk13
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr2b, $rk13 @ AES block 2 - round 13
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
aese $ctr3b, $rk13 @ AES block 3 - round 13
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
b.ge .L256_enc_tail @ handle tail
@ First four blocks: ctr0..ctr3 already hold AES rounds 0-13 of CTR blocks 0-3.
@ Round 14 is applied by XORing rk14 into the plaintext while it is still in
@ general registers; the EOR with ctrN then yields the ciphertext block.
@ CTR blocks 4-7 are set up in ctr0..ctr2 / ctr32x for the next stage.
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L256_enc_prepretail @ do prepretail (flags from the cmp above; target spelled with the .L prefix used at its definition)
.L256_enc_main_loop: @ main loop start
@ One pass handles four 16-byte blocks and interleaves three jobs:
@   AES   - rounds 0-13 on CTR blocks 4k+4..4k+7 (round 14 is XORed into the plaintext in general registers)
@   GHASH - folds the four ciphertext blocks left in res0..res3 by the previous pass into acc_h/acc_m/acc_l
@   MODULO - reduces the accumulator back into acc_l
@ The temporaries t0..t9, mod_constant and mod_t reuse v4..v8, so the instruction order below is load-bearing.
@ Repeats while input_ptr < main_end_input_ptr (cmp at LOOP CONTROL, b.lt at the bottom).
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
rev $ctr32w, $rctr32w @ CTR block 4k+10
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
b.lt .L256_enc_main_loop @ loop back (flags from LOOP CONTROL cmp; target spelled with the .L prefix used at its definition)
.L256_enc_prepretail: @ PREPRETAIL
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
eor $acc_lb, $acc_lb, $acc_mb
.L256_enc_tail: @ TAIL
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
cmp $main_end_input_ptr, #48
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L256_enc_blocks_more_than_3
cmp $main_end_input_ptr, #32
mov $ctr3b, $ctr2b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
sub $rctr32w, $rctr32w, #1
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
b.gt .L256_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
cmp $main_end_input_ptr, #16
b.gt .L256_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L256_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-2 block
eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L256_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
fmov $res1d, $input_l0 @ AES final block - mov low
fmov $res1.d[1], $input_h0 @ AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
.L256_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk14_l, $rk14_h, lt
csel $input_h0, $rk14_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
rev $ctr32w, $rctr32w
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
___
{
# Register aliases for the $t8/$t9 temporaries, lexically scoped to this brace
# block. Each temporary gets two names for the same SIMD register: the vector
# name (used with an arrangement suffix such as .1q/.8b/.16b) and the 64-bit
# d-register view (used for the scalar "mov $t9d, ..." form).
# In the decrypt kernel below, $t8 receives the GHASH block 4k+3 low product.
my $t8="v4";
my $t8d="d4";
# $t9 holds the GHASH block 4k+3 mid term and is then reused for the
# "karatsuba tidy up" (acc_l ^ acc_h) in the MODULO step.
my $t9="v6";
my $t9d="d6";
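#########################################################################################
# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
#                               size_t len,            /* length in BITS */
#                               unsigned char *out,
#                               u64 *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
# NOTE: the second argument is a bit count; the prologue shifts it right by 3
# to obtain the byte length. The prologue also copies the 5th and 6th
# arguments (x4, x5) into x16 and x8, which are presumably the counter block
# and the key schedule respectively - confirm against the register aliases
# defined earlier in this file and against the C declaration.
#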
$code.=<<___;
.global aes_gcm_dec_256_kernel
.type aes_gcm_dec_256_kernel,%function
.align 4
aes_gcm_dec_256_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldr $rk8q, [$cc, #128] @ load rk8
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ldr $rk7q, [$cc, #112] @ load rk7
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldr $rk6q, [$cc, #96] @ load rk6
lsr $rctr32x, $ctr96_t32x, #32
ldr $rk5q, [$cc, #80] @ load rk5
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk3q, [$cc, #48] @ load rk3
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1d, $ctr96_b64x @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk0q, [$cc, #0] @ load rk0
fmov $ctr3.d[1], $ctr32x @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
ldr $rk4q, [$cc, #64] @ load rk4
ldr $rk13q, [$cc, #208] @ load rk13
ldr $rk1q, [$cc, #16] @ load rk1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $rk9q, [$cc, #144] @ load rk9
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $rk12q, [$cc, #192] @ load rk12
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $rk10q, [$cc, #160] @ load rk10
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
ldr $rk11q, [$cc, #176] @ load rk11
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr2b, $rk13 @ AES block 2 - round 13
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr3b, $rk13 @ AES block 3 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
b.ge .L256_dec_tail @ handle tail
ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
rev64 $res1b, $res1b @ GHASH block 1
ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
rev64 $res0b, $res0b @ GHASH block 0
add $rctr32w, $rctr32w, #1 @ CTR block 4
fmov $ctr0d, $ctr96_b64x @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
b.ge .L256_dec_prepretail @ do prepretail
.L256_dec_main_loop: @ main loop start
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
eor $res0b, $res0b, $acc_lb @ PRE 1
rev $ctr32w, $rctr32w @ CTR block 4k+7
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
b.lt .L256_dec_main_loop
.L256_dec_prepretail: @ PREPRETAIL
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res2b, $res2b @ GHASH block 4k+2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L256_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
cmp $main_end_input_ptr, #48
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
b.gt .L256_dec_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr2b
movi $acc_m.8b, #0
movi $acc_l.8b, #0
cmp $main_end_input_ptr, #32
movi $acc_h.8b, #0
mov $ctr2b, $ctr1b
b.gt .L256_dec_blocks_more_than_2
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L256_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
.L256_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
.L256_dec_blocks_more_than_1: @ blocks left > 1
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
.L256_dec_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr32x, $rk14_l, $rk14_h, lt
csel $ctr96_b64x, $rk14_h, xzr, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
and $output_l0, $output_l0, $ctr32x
mov $ctr0.d[1], $ctr96_b64x
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
rev $ctr32w, $rctr32w
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
orr $output_l0, $output_l0, $end_input_ptr
and $output_h0, $output_h0, $ctr96_b64x
orr $output_h0, $output_h0, $main_end_input_ptr
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
mov $t0d, $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
stp $output_l0, $output_h0, [$output_ptr]
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}
# Append the trailer of the generated assembly to $code: an identification
# string, an alignment directive and a closing "#endif".  The "\@" keeps Perl
# from treating "@openssl" as an array to interpolate inside the heredoc.
# NOTE(review): the conditional that this "#endif" closes is presumably
# emitted earlier in this script - confirm.
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
# unvmov(OPERANDS)
#
# Turn the operand list of a half-register move written as
# "qD#lo|hi, qS#lo|hi" into the equivalent "ins vD.d[i],vS.d[j]" text.
#   - "lo" selects doubleword lane 0, "hi" selects lane 1;
#   - register numbers below 8 are kept, numbers 8 and above are moved up
#     by 8 (q8 -> v16, q9 -> v17, ...).
#     NOTE(review): presumably this steers clear of v8-v15 - confirm.
# Returns a false value when OPERANDS does not have that shape.
sub unvmov {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/;
    return $matched unless $matched;

    my ($dst_q, $dst_half, $src_q, $src_half) = ($1, $2, $3, $4);

    my $dst_v    = $dst_q < 8 ? $dst_q : $dst_q + 8;
    my $src_v    = $src_q < 8 ? $src_q : $src_q + 8;
    my $dst_lane = $dst_half eq "lo" ? 0 : 1;
    my $src_lane = $src_half eq "lo" ? 0 : 1;

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
                   $dst_v, $dst_lane, $src_v, $src_lane;
}
# 64-bit flavour: print $code one line at a time, rewriting the first
# "@" + whitespace comment marker on each line as "//" (old -> new style
# commentary).  The whitespace character after "@" is consumed as well.
for my $asm_line (split /\n/, $code) {
    $asm_line =~ s{@\s}{//};
    print $asm_line, "\n";
}
} else { ######## 32-bit code
# unvdup32(OPERANDS)
#
# Rewrite the operands of a 32-bit lane duplicate given as "qD, qS[lane]"
# (lane 0..3) so that the source is addressed as a doubleword register:
# qS[lane] becomes d(2*S + lane/2)[lane%2].  Yields the complete
# "vdup.32 qD,dN[i]" text, or a false value if OPERANDS does not match.
sub unvdup32 {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/;
    return $matched unless $matched;

    my ($dst_q, $src_q, $lane) = ($1, $2, $3);

    my $src_d  = 2 * $src_q + ($lane >> 1);    # which half of qS holds the lane
    my $d_lane = $lane & 1;                    # lane index inside that half

    return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $d_lane;
}
# unvpmullp64(MNEMONIC, ARG)
#
# Hand-encode a "pmull"/"pmull2" polynomial multiply whose operands are
# three q registers ("qD, qN, qM") for the 32-bit flavour.
#
#   MNEMONIC  "pmull" or "pmull2"; any mnemonic containing a "2" sets two
#             extra bits (0x00010001) in the instruction word.
#   ARG       operand text; must contain "qD, qN, qM".
#
# Returns an "INST(b0,b1,b2,b3)" invocation carrying the four instruction
# bytes, low byte first, followed by the original mnemonic and operands as
# an "@" comment.  Returns the empty string when ARG does not name three
# q registers.
# NOTE(review): INST is presumably a macro provided by the generated
# assembly's prologue - confirm.
sub unvpmullp64 {
    my ($mnemonic, $arg) = @_;

    # Make the "nothing to encode" result explicit instead of relying on the
    # value a failed condition happens to leave behind.
    return '' unless $arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/;
    my ($qd, $qn, $qm) = ($1, $2, $3);

    # Start from the base opcode word and merge in each register number: the
    # low three bits and bit 3 of every register go to separate bit fields.
    my $word = 0xf2a00e00
             | (($qd & 7) << 13) | (($qd & 8) << 19)
             | (($qn & 7) << 17) | (($qn & 8) << 4)
             | (($qm & 7) << 1)  | (($qm & 8) << 2);
    $word |= 0x00010001 if $mnemonic =~ /2/;

    # ARMv7 instructions are always encoded little-endian, so the bytes are
    # emitted least-significant first.  The correct solution would be to use
    # the .inst directive, but older assemblers don't implement it :-(
    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                   $word & 0xff,         ($word >> 8) & 0xff,
                   ($word >> 16) & 0xff, ($word >> 24) & 0xff,
                   $mnemonic, $arg;
}
# 32-bit flavour: translate every line of $code from the new-style
# (AArch64-like) spelling to the old 32-bit ARM spelling, then print it.
# The order of the substitutions matters; do not rearrange them.
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers: every wN/xN becomes rN
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers: vN.<arrangement> becomes qN (pattern accepts a single-digit N only)
s/\/\/\s?/@ /o; # new->old style commentary: first "//" (plus one optional whitespace char) becomes "@ "
# fix up remaining new-style suffixes
# First "],#imm" on the line becomes "]!".
s/\],#[0-9]+/]!/o;
# The statements joined by "or" below form one chain that stops at the first
# substitution that succeeds, so at most one of them is applied per line:
#   cclr REG,COND    -> mov.COND REG,#0 (finished by the "mov." fix-up below)
#   vdup.32 ...      -> operands rewritten by unvdup32()
#   [v]pmull[2].p64  -> raw instruction bytes from unvpmullp64()
#   qN#lo / qN#hi    -> d(2N) / d(2N+1)
#   b.COND           -> bCOND (leading whitespace required)
#   ret              -> bx lr (leading whitespace required)
s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
# A conditional "mov.COND" loses its dot and is preceded by a separate
# "it COND" line; $2 is the condition captured by the substitution.
if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
print " it $2\n";
}
print $_,"\n";
}
}
# Close STDOUT explicitly: this forces buffered output to be flushed and
# turns a failed write into a fatal error instead of losing it silently.
close(STDOUT) || die "error closing STDOUT: $!";    # enforce flush