#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#         gcm_ghash_rv64i_zvkb_zvbc
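#
# Only Htable[0] is written here: it receives H shifted left by one bit and
# reduced mod P (the usual key pre-shift for carry-less-multiply GHASH),
# which is the form expected by the Montgomery-style reduction in
# gcm_gmult_rv64i_zvkb_zvbc and gcm_ghash_rv64i_zvkb_zvbc below.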
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)

# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1

# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from the x86 ghash implementation in OpenSSL.
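# Concretely: after the shift at most the LSB of the upper half can be set,
# and when it is, a single XOR with the Lpolymod constant (the masked vxor
# below) is enough to bring the value back into 128 bits.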
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t

# Need to set the mask to 3 if the carry bit is set.
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3

@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t

@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)

# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5

# Multiplication

# Do two 64x64 multiplications in one go to save some time
# and simplify things.

# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
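# Written out, with all additions being carry-less (XOR) and h/l denoting
# the high/low 64 bits of each 128-bit partial product:
#   A*B = (a1*2^64 + a0) * (b1*2^64 + b0)
#       = a1b1*2^128 + (a1b0 + a0b1)*2^64 + a0b0
# so c1 collects a1b1 plus the high halves of the cross products and c0
# collects a0b0 plus their low halves.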

# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0

# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1

# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t

# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.
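# In outline: the low 128 bits (c1, c0) are folded upwards in two 64-bit
# steps, each step multiplying by the reduction constant P (loaded into t3
# above) and XOR-ing the result one 64-bit position higher; the exact
# terms are spelled out below.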

# First step:
# c1 += (c0 * P)l
# (no vmv.v.i v0, 2 needed here; the mask register v0 still holds 2)
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# Second step:
# D = d1,d0 is the final result
# We want (with c1 as updated by the first step):
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0

# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
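# v1 = (m1, m0) at this point.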

# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1

@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: next hash value (Xi+1)
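#
# Each iteration of the loop below performs the standard GHASH block update
# Xi = (Xi xor inp) * H in GF(2^128); the multiply/reduce sequence is the
# same as in gcm_gmult_rv64i_zvkb_zvbc above.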
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)

# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8

@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4

Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7

@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5

# Multiplication

# Do two 64x64 multiplications in one go to save some time
# and simplify things.

# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l

# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0

# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1

# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t

@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t

# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.

# First step:
# c1 += (c0 * P)l
# (no vmv.v.i v0, 2 needed here; the mask register v0 still holds 2)
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# Second step:
# D = d1,d0 is the final result
# We want (with c1 as updated by the first step):
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0

# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3

@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1

@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t

# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1

@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2

bnez $len, Lstep

@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
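# GHASH reduction constant: the GCM polynomial in the bit-reflected form
# used here, pre-shifted by one bit (0xc2000000000000000000000000000001)
# to match the H << 1 key transformation done in gcm_init_rv64i_zvkb_zvbc.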
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";