#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#         gcm_ghash_rv64i_zvkb_zvbc
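#
# Only Htable[0] is written here: it receives H shifted left by one bit and
# reduced mod P (the usual key pre-shift for carry-less-multiply GHASH),
# which is the form expected by the Montgomery-style reduction in
# gcm_gmult_rv64i_zvkb_zvbc and gcm_ghash_rv64i_zvkb_zvbc below.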
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)

# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1

# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from the x86 ghash implementation in OpenSSL.
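# Concretely: after the shift at most the LSB of the upper half can be set,
# and when it is, a single XOR with the Lpolymod constant (the masked vxor
# below) is enough to bring the value back into 128 bits.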
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t

# Need to set the mask to 3 if the carry bit is set.
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3

@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t

@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)

# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5

# Multiplication

# Do two 64x64 multiplications in one go to save some time
# and simplify things.

# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
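# Written out, with all additions being carry-less (XOR) and h/l denoting
# the high/low 64 bits of each 128-bit partial product:
#   A*B = (a1*2^64 + a0) * (b1*2^64 + b0)
#       = a1b1*2^128 + (a1b0 + a0b1)*2^64 + a0b0
# so c1 collects a1b1 plus the high halves of the cross products and c0
# collects a0b0 plus their low halves.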

# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0

# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1

# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t

# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.
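# In outline: the low 128 bits (c1, c0) are folded upwards in two 64-bit
# steps, each step multiplying by the reduction constant P (loaded into t3
# above) and XOR-ing the result one 64-bit position higher; the exact
# terms are spelled out below.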

# First step:
# c1 += (c0 * P)l
# (no vmv.v.i v0, 2 needed here; the mask register v0 still holds 2)
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# Second step:
# D = d1,d0 is the final result
# We want (with c1 as updated by the first step):
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0

# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
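# v1 = (m1, m0) at this point.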

# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1

@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: next hash value (Xi+1)
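#
# Each iteration of the loop below performs the standard GHASH block update
# Xi = (Xi xor inp) * H in GF(2^128); the multiply/reduce sequence is the
# same as in gcm_gmult_rv64i_zvkb_zvbc above.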
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)

# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4

Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7

@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5

# Multiplication

# Do two 64x64 multiplications in one go to save some time
# and simplify things.

# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l

# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0

# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1

# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t

# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.

# First step:
# c1 += (c0 * P)l
# (no vmv.v.i v0, 2 needed here; the mask register v0 still holds 2)
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# Second step:
# D = d1,d0 is the final result
# We want (with c1 as updated by the first step):
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0

# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1

@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2

bnez $len, Lstep

@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
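# GHASH reduction constant: the GCM polynomial in the bit-reflected form
# used here, pre-shifted by one bit (0xc2000000000000000000000000000001)
# to match the H << 1 key transformation done in gcm_init_rv64i_zvkb_zvbc.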
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";