blob: 4fc8947b851926640894da628ccd57e763b75e96 [file] [log] [blame]
#! /usr/bin/env perl
# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Both values are undef when the corresponding argument is absent.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# $dir is the directory part of this script's path, trailing separator
# included, as captured from $0.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for arm-xlate.pl next to this script first, then in ../../perlasm/.
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Run arm-xlate.pl under the current perl interpreter ($^X) and make it the
# target of STDOUT, so that every later print is piped through it.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Prefix of every exported symbol emitted by this script.
$prefix="vpsm4";
# NEON register allocation:
#   @vtmp   v0-v3    scratch / results of the multi-block encrypt helpers
#   @data   v4-v7    first group of four state vectors
#   @datax  v8-v11   second group of four state vectors
#   v12-v15          shared: ($rk0,$rk1), ($rka,$rkb) and @vtmpx all name
#                    the same four registers
#   @sbox   v16-v31  the 256-byte S-box, filled in by load_sbox
my @vtmp=map("v$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
# General-purpose registers.  $tmpw and $tmp are the 32- and 64-bit views
# of the same register (w6/x6).
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($ptr,$counter)=("x10","w11");
# w12-w15 carry the four 32-bit state words in the scalar (one block) path.
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
# Emit a byte reversal of every 32-bit lane of a vector register.
#   $dst - destination register name (e.g. "v4")
#   $src - optional source register name; when omitted, false, or equal to
#          $dst the reversal is done in place
# The reversal is only emitted when __ARMEB__ is not defined.  In the
# two-register form a plain register copy is emitted for __ARMEB__ instead,
# so that $dst is written either way.
sub rev32() {
    my ($dst, $src) = @_;

    if (!$src or ("$src" eq "$dst")) {
        # In-place form: nothing at all is needed when __ARMEB__ is defined.
$code.=<<___;
#ifndef __ARMEB__
rev32 $dst.16b,$dst.16b
#endif
___
    } else {
        # Two-register form.
$code.=<<___;
#ifndef __ARMEB__
rev32 $dst.16b,$src.16b
#else
mov $dst.16b,$src.16b
#endif
___
    }
}
# Emit a 4x4 transpose of 32-bit words held in four vector registers.
# Arguments, in order: the four registers to transpose in place, followed
# by four scratch registers that are overwritten.
# The first four zips interleave 32-bit lanes of register pairs into the
# scratch registers; the last four interleave 64-bit lanes of the scratch
# registers back into the data registers.
sub transpose() {
    my @row = @_[0 .. 3];
    my @tmp = @_[4 .. 7];
$code.=<<___;
zip1 $tmp[0].4s,$row[0].4s,$row[1].4s
zip2 $tmp[1].4s,$row[0].4s,$row[1].4s
zip1 $tmp[2].4s,$row[2].4s,$row[3].4s
zip2 $tmp[3].4s,$row[2].4s,$row[3].4s
zip1 $row[0].2d,$tmp[0].2d,$tmp[2].2d
zip2 $row[1].2d,$tmp[0].2d,$tmp[2].2d
zip1 $row[2].2d,$tmp[1].2d,$tmp[3].2d
zip2 $row[3].2d,$tmp[1].2d,$tmp[3].2d
___
}
# S-box substitution followed by the round linear transform, applied to
# every byte / 32-bit lane of one vector register.
#   $dat - register name; input and output (updated in place)
# Clobbers @vtmp[0..2].
#
# The 256-byte table lives in @sbox[0..15] and is consulted as four
# 64-byte quarters.  The index vector is used as is for the first quarter
# and with 64, 128 and 192 subtracted for the other three.  The four tbl
# results are then added together; this relies on tbl producing 0 for an
# index outside its table (per the A64 TBL definition), so only one of the
# four partial results is non-zero for any byte.
#
# The ushr/sli pairs build rotations of each 32-bit lane, giving
#   $dat = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
# where B is the substituted value.
sub sbox() {
my $dat = shift;
$code.=<<___;
movi @vtmp[0].16b,#64
movi @vtmp[1].16b,#128
movi @vtmp[2].16b,#192
sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
ushr @vtmp[0].4s,$dat.4s,32-2
sli @vtmp[0].4s,$dat.4s,2
ushr @vtmp[2].4s,$dat.4s,32-10
eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
sli @vtmp[2].4s,$dat.4s,10
eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
ushr @vtmp[0].4s,$dat.4s,32-18
sli @vtmp[0].4s,$dat.4s,18
ushr @vtmp[2].4s,$dat.4s,32-24
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$dat.4s,24
eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}
# S-box substitution followed by the round linear transform for two vector
# registers at once (eight 32-bit lanes in total).
#   $dat  - first register name; input and output
#   $datx - second register name; input and output
# Clobbers @vtmp[0..3].
#
# The lookup uses the same four-quarter scheme as sbox(), except that a
# single constant 64 (kept in @vtmp[3]) is subtracted three times in a row
# to form the indices for the upper three quarters.  The substitution is
# done for $dat first and then for $datx.
#
# In the second half the rotations of $dat (accumulated in @vtmp[1]) and
# of $datx (accumulated in @vtmp[3]) are interleaved instruction by
# instruction; each register ends up as
#   B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
sub sbox_double() {
my $dat = shift;
my $datx = shift;
$code.=<<___;
movi @vtmp[3].16b,#64
sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
add $dat.2d,@vtmp[2].2d,$dat.2d
add $dat.2d,@vtmp[1].2d,$dat.2d
sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
add $datx.2d,@vtmp[2].2d,$datx.2d
add $datx.2d,@vtmp[1].2d,$datx.2d
ushr @vtmp[0].4s,$dat.4s,32-2
sli @vtmp[0].4s,$dat.4s,2
ushr @vtmp[2].4s,$datx.4s,32-2
eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
sli @vtmp[2].4s,$datx.4s,2
ushr @vtmp[0].4s,$dat.4s,32-10
eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
sli @vtmp[0].4s,$dat.4s,10
ushr @vtmp[2].4s,$datx.4s,32-10
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,10
ushr @vtmp[0].4s,$dat.4s,32-18
eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
sli @vtmp[0].4s,$dat.4s,18
ushr @vtmp[2].4s,$datx.4s,32-18
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,18
ushr @vtmp[0].4s,$dat.4s,32-24
eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
sli @vtmp[0].4s,$dat.4s,24
ushr @vtmp[2].4s,$datx.4s,32-24
eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,24
eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}
# S-box substitution followed by the round linear transform for a single
# 32-bit word held in a general-purpose register.
#   $word - W register name; input and output
# Clobbers @vtmp[0..3], $wtmp0 and $wtmp2.  $wtmp1 is not touched, which
# sm4_1blk depends on to keep its second round key alive across the call.
#
# The word is inserted into lane 0 of @vtmp[0]; only lane 0 of each lookup
# result is read back, so the other lanes are don't-care.  The four
# quarter lookups are combined with scalar adds, and the rotations are
# expressed as "ror #32-n", i.e. a left rotation by n (n = 2, 10, 18, 24).
sub sbox_1word () {
my $word = shift;
$code.=<<___;
movi @vtmp[1].16b,#64
movi @vtmp[2].16b,#128
movi @vtmp[3].16b,#192
mov @vtmp[0].s[0],$word
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
mov $word,@vtmp[0].s[0]
mov $wtmp0,@vtmp[1].s[0]
mov $wtmp2,@vtmp[2].s[0]
add $wtmp0,$word,$wtmp0
mov $word,@vtmp[3].s[0]
add $wtmp0,$wtmp0,$wtmp2
add $wtmp0,$wtmp0,$word
eor $word,$wtmp0,$wtmp0,ror #32-2
eor $word,$word,$wtmp0,ror #32-10
eor $word,$word,$wtmp0,ror #32-18
eor $word,$word,$wtmp0,ror #32-24
___
}
# Four SM4 rounds for one block whose state words are in the scalar
# registers $word0..$word3.
#   $kptr - X register name pointing at the next round key; it is
#           post-incremented by 8 twice, i.e. four 32-bit keys are consumed
# Each round XORs three state words with a round key, runs the result
# through sbox_1word(), and XORs that into the remaining state word.
# Round keys are loaded two at a time into $wtmp0/$wtmp1.  sbox_1word()
# clobbers $wtmp0 and $wtmp2, so the first key of each pair is consumed
# before the call and the second ($wtmp1) is used after it.
sub sm4_1blk () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $tmpw,$word2,$word3
eor $wtmp2,$wtmp0,$word1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word0,$word0,$tmpw
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $tmpw,$word2,$word3
eor $wtmp2,$word0,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
# The second key pair is fetched here, once $wtmp1 from the first pair is
# no longer needed.
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor $word1,$word1,$tmpw
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $tmpw,$word0,$word1
eor $wtmp2,$wtmp0,$word3
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word2,$word2,$tmpw
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $tmpw,$word0,$word1
eor $wtmp2,$word2,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word3,$word3,$tmpw
___
}
# Four SM4 rounds for four blocks processed in parallel.  The state is held
# word-sliced: @data[i] carries state word i, one 32-bit lane per block.
#   $kptr - X register name pointing at the next round key; it is
#           post-incremented by 8 twice, i.e. four 32-bit keys are consumed
# Each round key is broadcast to all lanes with dup.  $rka caches the XOR
# of the two state words shared by consecutive rounds, so the second round
# of each pair only has to XOR in the freshly updated word.
# Uses $rk0, $rk1 and $rka plus whatever sbox() clobbers; $rkb is not
# written here.
sub sm4_4blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rk0.16b,@data[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $rka.16b,$rka.16b,@data[0].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk1.16b
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rk0.16b,@data[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $rka.16b,$rka.16b,@data[2].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk1.16b
___
}
# Four SM4 rounds for eight blocks processed in parallel.  The state is
# held word-sliced in two groups of four blocks:
#   @data[i]  - state word i of the first four blocks
#   @datax[i] - state word i of the second four blocks
#   $kptr     - X register name pointing at the next round key; it is
#               post-incremented by 8 twice, i.e. four keys are consumed
# $rk0/$rk1 double as the sbox_double() operands for the first and second
# group, so each round key is broadcast with dup just before it is needed.
# $rka/$rkb cache the XOR of the two state words shared by consecutive
# rounds for the first and second group respectively.
# @vtmp[0] and @vtmp[1] are used as scratch here, in addition to whatever
# sbox_double() clobbers.
sub sm4_8blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rkb.16b,@datax[2].16b,@datax[3].16b
eor @vtmp[0].16b,@data[1].16b,$rk0.16b
eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
eor @datax[0].16b,@datax[0].16b,$rk1.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[0].16b
eor $rkb.16b,$rkb.16b,@datax[0].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk0.16b
eor @datax[1].16b,@datax[1].16b,$rk1.16b
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rkb.16b,@datax[0].16b,@datax[1].16b
eor @vtmp[0].16b,@data[3].16b,$rk0.16b
eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
eor @datax[2].16b,@datax[2].16b,$rk1.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[2].16b
eor $rkb.16b,$rkb.16b,@datax[2].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk0.16b
eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}
# Emit the complete 32-round SM4 transform for one block held in a vector
# register, without the trailing per-word byte reversal.
#   $dat (first argument) - vector register name; input and output
# The four 32-bit lanes are moved into $word0..$word3, eight iterations of
# sm4_1blk() (four rounds each) are run with $ptr walking the round keys
# starting at $rks, and the words are written back in reverse lane order.
sub encrypt_1blk_norev() {
    my ($blk) = @_;
    my @state = ($word0, $word1, $word2, $word3);

$code.=<<___;
mov $ptr,$rks
mov $counter,#8
___
    # Lane i of the block goes to state word i.
    for my $lane (0 .. 3) {
        $code .= sprintf("mov %s,%s.s[%d]\n", $state[$lane], $blk, $lane);
    }
$code.=<<___;
10:
___
    &sm4_1blk($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
___
    # Final reverse: lane i receives state word 3-i.
    for my $lane (0 .. 3) {
        $code .= sprintf("mov %s.s[%d],%s\n", $blk, $lane, $state[3 - $lane]);
    }
}
# Emit the full single-block SM4 transform for the vector register named
# by the first argument: encrypt_1blk_norev() followed by an in-place
# rev32() of the result.
sub encrypt_1blk() {
    my ($blk) = @_;

    &encrypt_1blk_norev($blk);
    &rev32($blk, $blk);
}
# Emit the 32-round SM4 transform for four word-sliced blocks in @data.
# Runs eight iterations of sm4_4blks() with $ptr walking the round keys
# from $rks.  The result is left in @vtmp with the word order reversed
# (@vtmp[3-i] is produced from @data[i]) and rev32() applied to each.
sub encrypt_4blks() {
$code.=<<___;
mov $ptr,$rks
mov $counter,#8
10:
___
    &sm4_4blks($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
___
    for my $i (0 .. 3) {
        &rev32($vtmp[3 - $i], $data[$i]);
    }
}
# Emit the 32-round SM4 transform for eight word-sliced blocks held in
# @data (first four blocks) and @datax (second four blocks).
# Runs eight iterations of sm4_8blks() with $ptr walking the round keys
# from $rks.  Results, word order reversed and rev32() applied:
#   first four blocks  -> @vtmp  (@vtmp[3-i] from @data[i])
#   second four blocks -> @data  (@data[3-i] from @datax[i])
# The @data registers are only overwritten after they have been read.
sub encrypt_8blks() {
$code.=<<___;
mov $ptr,$rks
mov $counter,#8
10:
___
    &sm4_8blks($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
___
    for my $i (0 .. 3) {
        &rev32($vtmp[3 - $i], $data[$i]);
    }
    for my $i (0 .. 3) {
        &rev32($data[3 - $i], $datax[$i]);
    }
}
# Emit code that loads the 256-byte table at .Lsbox into @sbox (v16-v31),
# 64 bytes per ld1.  Takes no arguments; any that are passed are ignored.
# Clobbers $ptr.
sub load_sbox () {
$code.=<<___;
adr $ptr,.Lsbox
ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
___
}
# Start of the generated assembly ($code is assigned, not appended to):
# file header plus the read-only constant object _vpsm4_consts, which holds
#   .Lsbox     - the 256-byte S-box, 16 rows of 16 bytes
#   .Lck       - 32 words, read one per iteration by the key schedule
#   .Lfk       - 16 bytes XORed into the user key before the key schedule
#   .Lshuffles - tbl index vector used by the key schedule to rotate the
#                four key words by one position after every round key
$code=<<___;
#include "arm_arch.h"
.arch armv8-a
.text
.type _vpsm4_consts,%object
.align 7
_vpsm4_consts:
.Lsbox:
.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.dword 0x0B0A090807060504,0x030201000F0E0D0C
.size _vpsm4_consts,.-_vpsm4_consts
___
{{{
# _vpsm4_set_key: internal key-schedule routine shared by the encrypt and
# decrypt key setup entry points.
#   x0 ($key)  - 16-byte user key
#   x1 ($keys) - output buffer for the 32 round keys (128 bytes)
#   w2 ($enc)  - non-zero: store round keys in ascending order;
#                zero: start at offset 124 and store them in descending
#                order, i.e. the schedule is written reversed
# The key is XORed with .Lfk, then 32 round keys are derived.  Each
# iteration XORs key words 1..3 with the next .Lck word, substitutes the
# bytes through the S-box, applies
#     B ^ (B <<< 13) ^ (B <<< 23)        (written as ror #19 and ror #9)
# XORs the result into key word 0, stores that word as the round key and
# rotates the key vector by one word using the .Lshuffles tbl.
# The S-box lookup here chains one tbl with three tbx instructions, each
# on an index lowered by a further 64 (the constant kept in @vtmp[0]), so
# the partial results accumulate in @vtmp[1] without needing any adds.
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _vpsm4_set_key,%function
.align 4
_vpsm4_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {$vkey.4s},[$key]
___
&load_sbox();
&rev32($vkey,$vkey);
$code.=<<___;
adr $pointer,.Lshuffles
ld1 {$vmap.4s},[$pointer]
adr $pointer,.Lfk
ld1 {$vfk.4s},[$pointer]
eor $vkey.16b,$vkey.16b,$vfk.16b
mov $schedules,#32
adr $pointer,.Lck
movi @vtmp[0].16b,#64
cbnz $enc,1f
add $keys,$keys,124
1:
mov $wtmp,$vkey.s[1]
ldr $roundkey,[$pointer],#4
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[2]
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[3]
eor $roundkey,$roundkey,$wtmp
// sbox lookup
mov @data[0].s[0],$roundkey
tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
mov $wtmp,@vtmp[1].s[0]
eor $roundkey,$wtmp,$wtmp,ror #19
eor $roundkey,$roundkey,$wtmp,ror #9
mov $wtmp,$vkey.s[0]
eor $roundkey,$roundkey,$wtmp
mov $vkey.s[0],$roundkey
cbz $enc,2f
str $roundkey,[$keys],#4
b 3f
2:
str $roundkey,[$keys],#-4
3:
tbl $vkey.16b,{$vkey.16b},$vmap.16b
subs $schedules,$schedules,#1
b.ne 1b
ret
.size _vpsm4_set_key,.-_vpsm4_set_key
___
}}}
{{{
# Internal multi-block helpers _vpsm4_enc_4blks and _vpsm4_enc_8blks.
# Each is just the matching encrypt_4blks()/encrypt_8blks() body wrapped
# in a callable function, so the mode routines can "bl" to it instead of
# expanding the round loop inline at every use.  Inputs and outputs are
# the registers documented on encrypt_4blks()/encrypt_8blks().
for my $lanes (4, 8) {
    my $sym = "_vpsm4_enc_${lanes}blks";
$code.=<<___;
.type $sym,%function
.align 4
${sym}:
AARCH64_VALID_CALL_TARGET
___
    if ($lanes == 4) {
        &encrypt_4blks();
    } else {
        &encrypt_8blks();
    }
$code.=<<___;
ret
.size $sym,.-$sym
___
}
}}}
{{{
# Public key-setup entry points ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.  Both forward x0/x1 unchanged to
# _vpsm4_set_key and differ only in the direction flag placed in w2:
# 1 for the encryption schedule, 0 for the decryption schedule.
# The frame record (x29/x30) is saved around the call because "bl"
# overwrites the link register.
for my $variant (["encrypt", 1], ["decrypt", 0]) {
    my ($dirname, $flag) = @$variant;
    my $sym = "${prefix}_set_${dirname}_key";
$code.=<<___;
.globl $sym
.type $sym,%function
.align 5
${sym}:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,$flag
bl _vpsm4_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $sym,.-$sym
___
}
}}}
{{{
# Emit a public single-block routine ${prefix}_<dir>crypt.
#   first argument - direction tag used only in the symbol name
#                    ("en" or "de")
# Generated function arguments: x0 = input block, x1 = output block,
# x2 = round keys.  The emitted body is identical for both directions;
# x2 is copied into $rks, the register the round helpers read the key
# pointer from.
sub gen_block () {
    my ($dir) = @_;
    my ($inp, $outp, $rk) = ("x0", "x1", "x2");
    my $sym = "${prefix}_${dir}crypt";

$code.=<<___;
.globl $sym
.type $sym,%function
.align 5
${sym}:
AARCH64_VALID_CALL_TARGET
ld1 {@data[0].16b},[$inp]
___
    &load_sbox();
    &rev32($data[0], $data[0]);
$code.=<<___;
mov $rks,$rk
___
    &encrypt_1blk($data[0]);
$code.=<<___;
st1 {@data[0].16b},[$outp]
ret
.size $sym,.-$sym
___
}
&gen_block($_) for ("en", "de");
}}}
{{{
# ${prefix}_ecb_encrypt
#   x0 = input, x1 = output, x2 = length in bytes (converted to a block
#   count with a right shift by 4), x3 = round keys.
# Blocks are consumed eight at a time while at least eight remain, then
# four at a time, then a tail of one, two or three blocks.  Multi-block
# groups are loaded with ld4 so that each register receives one state word
# of every block, which is the layout the _vpsm4_enc_*blks helpers expect.
# The two- and three-block tails fill only lanes 0..1 / 0..2 and still run
# the four-block helper; the unused lanes are never stored.
# d8-d15 and x29/x30 are saved on entry and restored at label 100.
my $enc = "w4";
my @dat = map { "v$_" } (16 .. 23);
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
___
&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
cmp $blocks,#8
b.lt .Lecb_4_blocks_process
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
# Eight blocks: byte-reverse all eight state registers in place.
for my $reg (@data, @datax) {
    &rev32($reg, $reg);
}
$code.=<<___;
bl _vpsm4_enc_8blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp $blocks,#4
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
# Four blocks.
for my $reg (@data) {
    &rev32($reg, $reg);
}
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
sub $blocks,$blocks,#4
1:
// process last block
cmp $blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].16b},[$inp]
___
# Exactly one block left: scalar path.
&rev32($data[0], $data[0]);
&encrypt_1blk($data[0]);
$code.=<<___;
st1 {@data[0].16b},[$outp]
b 100f
1: // process last 2 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
cmp $blocks,#2
b.gt 1f
___
# Exactly two blocks left.
for my $reg (@data) {
    &rev32($reg, $reg);
}
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
b 100f
1: // process last 3 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
# Three blocks left.
for my $reg (@data) {
    &rev32($reg, $reg);
}
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
# ${prefix}_cbc_encrypt
#   x0 = input, x1 = output, x2 = length in bytes (converted to a block
#   count with a right shift by 4), x3 = round keys, x4 = IV (read on
#   entry, updated on exit), w5 = direction (non-zero: encrypt,
#   zero: decrypt).
#
# Encryption is inherently serial, so it runs the single-block scalar path
# for every block (four per iteration of the main loop, then one at a
# time) and returns without creating a stack frame.
#
# Decryption (.Ldec) saves d8-d15 and x29/x30 and processes eight blocks
# per iteration while at least eight remain, then four, then a tail of
# one, two or three blocks.  The ciphertext is loaded twice: once with ld4
# (word-sliced, as input to _vpsm4_enc_*blks) and once with ld1 (one block
# per register, to serve as the chaining values XORed into the result).
#
# On every decrypt exit path the value written back to [x4] must be the
# LAST CIPHERTEXT block that was consumed, because that is the chaining
# value for a following call.
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
lsr $len,$len,4
___
&load_sbox();
$code.=<<___;
cbz $enc,.Ldec
ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
cmp $blocks,#4
b.lt 1f
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
eor @data[0].16b,@data[0].16b,$ivec0.16b
___
# Encrypt four blocks one after another.  Each ciphertext block is XORed
# into the next plaintext block while still in the "norev" form, and is
# only byte-reversed back afterwards.
&rev32(@data[1],@data[1]);
&rev32(@data[0],@data[0]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&encrypt_1blk_norev(@data[0]);
$code.=<<___;
eor @data[1].16b,@data[1].16b,@data[0].16b
___
&encrypt_1blk_norev(@data[1]);
&rev32(@data[0],@data[0]);
$code.=<<___;
eor @data[2].16b,@data[2].16b,@data[1].16b
___
&encrypt_1blk_norev(@data[2]);
&rev32(@data[1],@data[1]);
$code.=<<___;
eor @data[3].16b,@data[3].16b,@data[2].16b
___
&encrypt_1blk_norev(@data[3]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
orr $ivec0.16b,@data[3].16b,@data[3].16b
st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.ne .Lcbc_4_blocks_enc
b 2f
1:
subs $blocks,$blocks,#1
b.lt 2f
ld1 {@data[0].4s},[$inp],#16
eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
# Remaining blocks (fewer than four), one per iteration; $ivec0 carries
# the running chaining value and becomes the ciphertext block.
&rev32($ivec0,$ivec0);
&encrypt_1blk($ivec0);
$code.=<<___;
st1 {$ivec0.16b},[$outp],#16
b 1b
2:
// save back IV
st1 {$ivec0.16b},[$ivp]
ret
.Ldec:
// decryption mode starts
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
cmp $blocks,#8
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
add $ptr,$inp,#64
ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
# Eight-block decrypt.  $inp is not advanced by the ld4 loads above; it is
# advanced by the ld1 reloads of the same ciphertext further down.
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],$data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],$datax[3]);
$code.=<<___;
bl _vpsm4_enc_8blks
___
# Convert both result groups from word-sliced back to one block per
# register; @datax is free to use as scratch at this point.
&transpose(@vtmp,@datax);
&transpose(@data,@datax);
$code.=<<___;
ld1 {$ivec1.16b},[$ivp]
ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
// note ivec1 and vtmpx[3] are resuing the same register
// care needs to be taken to avoid conflict
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
// save back IV
st1 {$vtmpx[3].16b}, [$ivp]
eor @data[0].16b,@data[0].16b,$datax[3].16b
eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lcbc_8_blocks_dec
b.eq 100f
1:
ld1 {$ivec1.16b},[$ivp]
.Lcbc_4_blocks_dec:
cmp $blocks,#4
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
# Four-block decrypt.  $ivec1 (v15) survives the call below because
# sm4_4blks() does not write $rkb, which is the same register.
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],$data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&transpose(@vtmp,@datax);
# After the reload above, @data[0..3] hold the four ciphertext blocks and
# @vtmp[0..3] are about to become the four plaintext blocks.  The IV that
# is written back when the input ends here must therefore come from
# @data[3] (last ciphertext block), not from @vtmp[3] (last plaintext
# block); storing the latter corrupts the chaining value whenever the
# remaining length is exactly four blocks.  It is stored with the same
# .4s arrangement it was loaded with.
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
orr $ivec1.16b,@data[3].16b,@data[3].16b
eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.gt .Lcbc_4_blocks_dec
// save back IV
st1 {@data[3].4s}, [$ivp]
b 100f
1: // last block
subs $blocks,$blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].4s},[$inp],#16
// save back IV
st1 {$data[0].16b}, [$ivp]
___
# Exactly one block left: scalar path, working on a copy in @datax[0].
&rev32(@datax[0],@data[0]);
&encrypt_1blk(@datax[0]);
$code.=<<___;
eor @datax[0].16b,@datax[0].16b,$ivec1.16b
st1 {@datax[0].16b},[$outp],#16
b 100f
1: // last two blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
add $ptr,$inp,#16
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
subs $blocks,$blocks,1
b.gt 1f
___
# Exactly two blocks left; only lanes 0 and 1 carry data.
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
&transpose(@vtmp,@datax);
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
// save back IV
st1 {@data[1].16b}, [$ivp]
b 100f
1: // last 3 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
# Three blocks left; the third is loaded through $ptr before the call
# below reuses that register for the round keys.
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
&transpose(@vtmp,@datax);
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
// save back IV
st1 {@data[2].16b}, [$ivp]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks
#   x0 = input, x1 = output, w2 = number of blocks, x3 = round keys,
#   x4 = 16-byte counter block.
# The first three counter words are kept unchanged in $word0..$word2 and
# broadcast to all lanes; only the fourth word ($ctr, w5) is incremented,
# once per block, with 32-bit arithmetic.  The counter block in memory is
# not written back by this routine.
#
# A single block is handled by a fast path that creates no stack frame.
# Otherwise d8-d15 and x29/x30 are saved and the data is processed eight
# blocks at a time while at least eight remain, then four, then a tail of
# one, two or three blocks.  The keystream comes back word-sliced from
# _vpsm4_enc_*blks, so the input is loaded with ld4 and the result stored
# with st4 to match that layout.
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
ld1 {$ivec.4s},[$ivp]
___
&rev32($ivec,$ivec);
&load_sbox();
$code.=<<___;
cmp $blocks,#1
b.ne 1f
// fast processing for one single block without
// context saving overhead
___
# Encrypt the counter block itself; it is XORed with the input below.
&encrypt_1blk($ivec);
# The multi-block path below loads the first four blocks' counters into
# @data before testing for eight or more, so the eight-block branch only
# has to fill @datax.  @vtmpx (v12-v15) is used for the input only after
# the encrypt helper has returned, since the helper uses those registers
# for round keys.
$code.=<<___;
ld1 {@data[0].16b},[$inp]
eor @data[0].16b,@data[0].16b,$ivec.16b
st1 {@data[0].16b},[$outp]
ret
1:
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
mov $word0,$ivec.s[0]
mov $word1,$ivec.s[1]
mov $word2,$ivec.s[2]
mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
cmp $blocks,#4
b.lt 1f
dup @data[0].4s,$word0
dup @data[1].4s,$word1
dup @data[2].4s,$word2
mov @data[3].s[0],$ctr
add $ctr,$ctr,#1
mov $data[3].s[1],$ctr
add $ctr,$ctr,#1
mov @data[3].s[2],$ctr
add $ctr,$ctr,#1
mov @data[3].s[3],$ctr
add $ctr,$ctr,#1
cmp $blocks,#8
b.ge .Lctr32_8_blocks_process
bl _vpsm4_enc_4blks
ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.ne .Lctr32_4_blocks_process
b 100f
.Lctr32_8_blocks_process:
dup @datax[0].4s,$word0
dup @datax[1].4s,$word1
dup @datax[2].4s,$word2
mov @datax[3].s[0],$ctr
add $ctr,$ctr,#1
mov $datax[3].s[1],$ctr
add $ctr,$ctr,#1
mov @datax[3].s[2],$ctr
add $ctr,$ctr,#1
mov @datax[3].s[3],$ctr
add $ctr,$ctr,#1
bl _vpsm4_enc_8blks
ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
eor @data[0].16b,@data[0].16b,@datax[0].16b
eor @data[1].16b,@data[1].16b,@datax[1].16b
eor @data[2].16b,@data[2].16b,@datax[2].16b
eor @data[3].16b,@data[3].16b,@datax[3].16b
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.ne .Lctr32_4_blocks_process
b 100f
1: // last block processing
subs $blocks,$blocks,#1
b.lt 100f
b.gt 1f
mov $ivec.s[0],$word0
mov $ivec.s[1],$word1
mov $ivec.s[2],$word2
mov $ivec.s[3],$ctr
___
# Exactly one block left.  encrypt_1blk() overwrites $word0..$word3, which
# is harmless here because no further counter block is built afterwards.
&encrypt_1blk($ivec);
# Two- and three-block tails: only lanes 0..1 / 0..2 of @data[3] receive a
# counter value, and only those lanes are loaded and stored.
$code.=<<___;
ld1 {@data[0].16b},[$inp]
eor @data[0].16b,@data[0].16b,$ivec.16b
st1 {@data[0].16b},[$outp]
b 100f
1: // last 2 blocks processing
dup @data[0].4s,$word0
dup @data[1].4s,$word1
dup @data[2].4s,$word2
mov @data[3].s[0],$ctr
add $ctr,$ctr,#1
mov @data[3].s[1],$ctr
subs $blocks,$blocks,#1
b.ne 1f
bl _vpsm4_enc_4blks
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
b 100f
1: // last 3 blocks processing
add $ctr,$ctr,#1
mov @data[3].s[2],$ctr
bl _vpsm4_enc_4blks
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
########################################
# Copy this script's leading comment block into the output, turning each
# leading "#" into "//".  The "#!" line is skipped and copying stops at
# the first line that is neither a comment nor empty.
# The script is opened with the three-argument form and a lexical handle
# so that no character in its path can be taken as an open mode.  As
# before, a failure to open it is not fatal: the header is simply omitted.
if (open(my $self, '<', $0)) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/\/\// and !/^$/);
        print;
    }
    close $self;
}

# Emit the generated assembly line by line, replacing every `...` span
# with the result of evaluating its contents as Perl.
foreach (split("\n", $code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

# STDOUT is the pipe into arm-xlate.pl; a failing close means the
# translator failed or the output is incomplete.
close STDOUT or die "error closing STDOUT: $!";