| #! /usr/bin/env perl |
| # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # This module implements Poly1305 hash for SPARCv9, vanilla, as well |
| # as VIS3 and FMA extensions. |
| # |
| # May, August 2015 |
| # |
| # Numbers are cycles per processed byte with poly1305_blocks alone. |
| # |
| # IALU(*) FMA |
| # |
| # UltraSPARC III 12.3(**) |
| # SPARC T3 7.92 |
| # SPARC T4 1.70(***) 6.55 |
| # SPARC64 X 5.60 3.64 |
| # |
| # (*) Comparison to compiler-generated code is really problematic, |
| # because latter's performance varies too much depending on too |
| # many variables. For example, one can measure from 5x to 15x |
| # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit |
| # unfair comparison, because compiler doesn't use VIS3, but |
| # given same initial conditions coefficient varies from 3x to 9x. |
| # (**) Pre-III performance should be even worse; floating-point |
| # performance for UltraSPARC I-IV on the other hand is reported |
| # to be 4.25 for hand-coded assembly, but they are just too old |
| # to care about. |
| # (***) Multi-process benchmark saturates at ~12.5x single-process |
| # result on 8-core processor, or ~21GBps per 2.85GHz socket. |
| |
# $output is the last argument if it looks like a file (it has an extension);
# without such an argument the generated assembly goes to the inherited STDOUT.
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Use three-argument open so that no character of the file name is taken as
# part of the open mode, and stop immediately if the file cannot be created
# instead of silently printing the assembly to the original STDOUT.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Integer register names used by poly1305_init, poly1305_blocks,
# poly1305_blocks_vis3 and poly1305_emit.
my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));   # %i0-%i5
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));    # %l0-%l7
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));     # %o0-%o5,%o7 (%o6 is skipped)
my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));                    # %g1-%g4
| |
# Integer-only code path: poly1305_init and poly1305_blocks.
#
# poly1305_init loads OPENSSL_sparcv9cap_P and, when the FMADD bit is set
# while the VIS3 bit is clear, branches to .Lpoly1305_init_fma.  Otherwise
# it zeroes the 24 bytes at $ctx+0 (hash value) and returns 0 when the key
# pointer ($inp) is zero.  With a key it loads 16 bytes little-endian
# (shifting the data into place when the pointer is not 8-byte aligned),
# masks them with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stores the
# result at $ctx+32.  If the VIS3 bit is set it then stores the addresses
# of poly1305_blocks_vis3 and poly1305_emit through %i2 and returns 1;
# without VIS3 it returns 0.
#
# poly1305_blocks handles $len/16 blocks.  Hash and key are kept as 32-bit
# limbs; every iteration adds one input block and $padbit to the hash,
# multiplies by the key with umul and folds the bits above 2^130 back in
# (the "final reduction step" below).
| $code.=<<___; |
| #ifndef __ASSEMBLER__ |
| # define __ASSEMBLER__ 1 |
| #endif |
| #include "crypto/sparc_arch.h" |
| |
| #ifdef __arch64__ |
| .register %g2,#scratch |
| .register %g3,#scratch |
| # define STPTR stx |
| # define SIZE_T 8 |
| #else |
| # define STPTR st |
| # define SIZE_T 4 |
| #endif |
| #define LOCALS (STACK_BIAS+STACK_FRAME) |
| |
| .section ".text",#alloc,#execinstr |
| |
| #ifdef __PIC__ |
| SPARC_PIC_THUNK(%g1) |
| #endif |
| |
| .globl poly1305_init |
| .align 32 |
| poly1305_init: |
| save %sp,-STACK_FRAME-16,%sp |
| nop |
| |
| SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1) |
| ld [%g1],%g1 |
| |
| and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1 |
| cmp %g1,SPARCV9_FMADD |
| be .Lpoly1305_init_fma |
| nop |
| |
| stx %g0,[$ctx+0] |
| stx %g0,[$ctx+8] ! zero hash value |
| brz,pn $inp,.Lno_key |
| stx %g0,[$ctx+16] |
| |
| and $inp,7,$shr ! alignment factor |
| andn $inp,7,$inp |
| sll $shr,3,$shr ! *8 |
| neg $shr,$shl |
| |
| sethi %hi(0x0ffffffc),$t0 |
| set 8,$h1 |
| or $t0,%lo(0x0ffffffc),$t0 |
| set 16,$h2 |
| sllx $t0,32,$t1 |
| or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc |
| or $t1,3,$t0 ! 0x0ffffffc0fffffff |
| |
| ldxa [$inp+%g0]0x88,$h0 ! load little-endian key |
| brz,pt $shr,.Lkey_aligned |
| ldxa [$inp+$h1]0x88,$h1 |
| |
| ldxa [$inp+$h2]0x88,$h2 |
| srlx $h0,$shr,$h0 |
| sllx $h1,$shl,$t2 |
| srlx $h1,$shr,$h1 |
| or $t2,$h0,$h0 |
| sllx $h2,$shl,$h2 |
| or $h2,$h1,$h1 |
| |
| .Lkey_aligned: |
| and $t0,$h0,$h0 |
| and $t1,$h1,$h1 |
| stx $h0,[$ctx+32+0] ! store key |
| stx $h1,[$ctx+32+8] |
| |
| andcc %g1,SPARCV9_VIS3,%g0 |
| be .Lno_key |
| nop |
| |
| 1: call .+8 |
| add %o7,poly1305_blocks_vis3-1b,%o7 |
| |
| add %o7,poly1305_emit-poly1305_blocks_vis3,%o5 |
| STPTR %o7,[%i2] |
| STPTR %o5,[%i2+SIZE_T] |
| |
| ret |
| restore %g0,1,%o0 ! return 1 |
| |
| .Lno_key: |
| ret |
| restore %g0,%g0,%o0 ! return 0 |
| .type poly1305_init,#function |
| .size poly1305_init,.-poly1305_init |
| |
| .globl poly1305_blocks |
| .align 32 |
| poly1305_blocks: |
| save %sp,-STACK_FRAME,%sp |
| srln $len,4,$len |
| |
| brz,pn $len,.Lno_data |
| nop |
| |
| ld [$ctx+32+0],$r1 ! load key |
| ld [$ctx+32+4],$r0 |
| ld [$ctx+32+8],$r3 |
| ld [$ctx+32+12],$r2 |
| |
| ld [$ctx+0],$h1 ! load hash value |
| ld [$ctx+4],$h0 |
| ld [$ctx+8],$h3 |
| ld [$ctx+12],$h2 |
| ld [$ctx+16],$h4 |
| |
| and $inp,7,$shr ! alignment factor |
| andn $inp,7,$inp |
| set 8,$d1 |
| sll $shr,3,$shr ! *8 |
| set 16,$d2 |
| neg $shr,$shl |
| |
| srl $r1,2,$s1 |
| srl $r2,2,$s2 |
| add $r1,$s1,$s1 |
| srl $r3,2,$s3 |
| add $r2,$s2,$s2 |
| add $r3,$s3,$s3 |
| |
| .Loop: |
| ldxa [$inp+%g0]0x88,$d0 ! load little-endian input |
| brz,pt $shr,.Linp_aligned |
| ldxa [$inp+$d1]0x88,$d1 |
| |
| ldxa [$inp+$d2]0x88,$d2 |
| srlx $d0,$shr,$d0 |
| sllx $d1,$shl,$t1 |
| srlx $d1,$shr,$d1 |
| or $t1,$d0,$d0 |
| sllx $d2,$shl,$d2 |
| or $d2,$d1,$d1 |
| |
| .Linp_aligned: |
| srlx $d0,32,$t0 |
| addcc $d0,$h0,$h0 ! accumulate input |
| srlx $d1,32,$t1 |
| addccc $t0,$h1,$h1 |
| addccc $d1,$h2,$h2 |
| addccc $t1,$h3,$h3 |
| addc $padbit,$h4,$h4 |
| |
| umul $r0,$h0,$d0 |
| umul $r1,$h0,$d1 |
| umul $r2,$h0,$d2 |
| umul $r3,$h0,$d3 |
| sub $len,1,$len |
| add $inp,16,$inp |
| |
| umul $s3,$h1,$t0 |
| umul $r0,$h1,$t1 |
| umul $r1,$h1,$t2 |
| add $t0,$d0,$d0 |
| add $t1,$d1,$d1 |
| umul $r2,$h1,$t0 |
| add $t2,$d2,$d2 |
| add $t0,$d3,$d3 |
| |
| umul $s2,$h2,$t1 |
| umul $s3,$h2,$t2 |
| umul $r0,$h2,$t0 |
| add $t1,$d0,$d0 |
| add $t2,$d1,$d1 |
| umul $r1,$h2,$t1 |
| add $t0,$d2,$d2 |
| add $t1,$d3,$d3 |
| |
| umul $s1,$h3,$t2 |
| umul $s2,$h3,$t0 |
| umul $s3,$h3,$t1 |
| add $t2,$d0,$d0 |
| add $t0,$d1,$d1 |
| umul $r0,$h3,$t2 |
| add $t1,$d2,$d2 |
| add $t2,$d3,$d3 |
| |
| umul $s1,$h4,$t0 |
| umul $s2,$h4,$t1 |
| umul $s3,$h4,$t2 |
| umul $r0,$h4,$h4 |
| add $t0,$d1,$d1 |
| add $t1,$d2,$d2 |
| srlx $d0,32,$h1 |
| add $t2,$d3,$d3 |
| srlx $d1,32,$h2 |
| |
| addcc $d1,$h1,$h1 |
| srlx $d2,32,$h3 |
| set 8,$d1 |
| addccc $d2,$h2,$h2 |
| srlx $d3,32,$t0 |
| set 16,$d2 |
| addccc $d3,$h3,$h3 |
| addc $t0,$h4,$h4 |
| |
| srl $h4,2,$t0 ! final reduction step |
| andn $h4,3,$t1 |
| and $h4,3,$h4 |
| add $t1,$t0,$t0 |
| |
| addcc $t0,$d0,$h0 |
| addccc %g0,$h1,$h1 |
| addccc %g0,$h2,$h2 |
| addccc %g0,$h3,$h3 |
| brnz,pt $len,.Loop |
| addc %g0,$h4,$h4 |
| |
| st $h1,[$ctx+0] ! store hash value |
| st $h0,[$ctx+4] |
| st $h3,[$ctx+8] |
| st $h2,[$ctx+12] |
| st $h4,[$ctx+16] |
| |
| .Lno_data: |
| ret |
| restore |
| .type poly1305_blocks,#function |
| .size poly1305_blocks,.-poly1305_blocks |
| ___ |
| ######################################################################## |
| # VIS3 has umulxhi and addxc... |
# ... so poly1305_blocks_vis3 keeps the hash as two 64-bit limbs ($H0,$H1)
# plus a 32-bit top limb ($H2), and the key as two 64-bit limbs ($R0,$R1),
# read from the same $ctx offsets that poly1305_blocks uses.
# $r1 and $r2 (%l1,%l2) only hold the load offsets 8 and 16 here.
# The early exit for $len/16 == 0 jumps to .Lno_data in poly1305_blocks.
# The umulxhi/addxc/addxccc lines are replaced by .word directives
# (see unvis3) when the accumulated code is printed.
| { |
| my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7)); |
| my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4)); |
| |
| $code.=<<___; |
| .align 32 |
| poly1305_blocks_vis3: |
| save %sp,-STACK_FRAME,%sp |
| srln $len,4,$len |
| |
| brz,pn $len,.Lno_data |
| nop |
| |
| ldx [$ctx+32+0],$R0 ! load key |
| ldx [$ctx+32+8],$R1 |
| |
| ldx [$ctx+0],$H0 ! load hash value |
| ldx [$ctx+8],$H1 |
| ld [$ctx+16],$H2 |
| |
| and $inp,7,$shr ! alignment factor |
| andn $inp,7,$inp |
| set 8,$r1 |
| sll $shr,3,$shr ! *8 |
| set 16,$r2 |
| neg $shr,$shl |
| |
| srlx $R1,2,$S1 |
| b .Loop_vis3 |
| add $R1,$S1,$S1 |
| |
| .Loop_vis3: |
| ldxa [$inp+%g0]0x88,$D0 ! load little-endian input |
| brz,pt $shr,.Linp_aligned_vis3 |
| ldxa [$inp+$r1]0x88,$D1 |
| |
| ldxa [$inp+$r2]0x88,$D2 |
| srlx $D0,$shr,$D0 |
| sllx $D1,$shl,$T1 |
| srlx $D1,$shr,$D1 |
| or $T1,$D0,$D0 |
| sllx $D2,$shl,$D2 |
| or $D2,$D1,$D1 |
| |
| .Linp_aligned_vis3: |
| addcc $D0,$H0,$H0 ! accumulate input |
| sub $len,1,$len |
| addxccc $D1,$H1,$H1 |
| add $inp,16,$inp |
| |
| mulx $R0,$H0,$D0 ! r0*h0 |
| addxc $padbit,$H2,$H2 |
| umulxhi $R0,$H0,$D1 |
| mulx $S1,$H1,$T0 ! s1*h1 |
| umulxhi $S1,$H1,$T1 |
| addcc $T0,$D0,$D0 |
| mulx $R1,$H0,$T0 ! r1*h0 |
| addxc $T1,$D1,$D1 |
| umulxhi $R1,$H0,$D2 |
| addcc $T0,$D1,$D1 |
| mulx $R0,$H1,$T0 ! r0*h1 |
| addxc %g0,$D2,$D2 |
| umulxhi $R0,$H1,$T1 |
| addcc $T0,$D1,$D1 |
| mulx $S1,$H2,$T0 ! s1*h2 |
| addxc $T1,$D2,$D2 |
| mulx $R0,$H2,$T1 ! r0*h2 |
| addcc $T0,$D1,$D1 |
| addxc $T1,$D2,$D2 |
| |
| srlx $D2,2,$T0 ! final reduction step |
| andn $D2,3,$T1 |
| and $D2,3,$H2 |
| add $T1,$T0,$T0 |
| |
| addcc $T0,$D0,$H0 |
| addxccc %g0,$D1,$H1 |
| brnz,pt $len,.Loop_vis3 |
| addxc %g0,$H2,$H2 |
| |
| stx $H0,[$ctx+0] ! store hash value |
| stx $H1,[$ctx+8] |
| st $H2,[$ctx+16] |
| |
| ret |
| restore |
| .type poly1305_blocks_vis3,#function |
| .size poly1305_blocks_vis3,.-poly1305_blocks_vis3 |
| ___ |
| } |
# poly1305_emit: finalization for the integer and VIS3 code paths.
# The second and third argument registers are given new names here:
# $mac is the output pointer (%i1) and $nonce the nonce pointer (%i2).
# The code loads the five 32-bit hash limbs, computes hash+5 and selects
# that sum when bit 2 of the resulting top limb is set, then adds the four
# 32-bit nonce words and writes the 16-byte result byte by byte in
# little-endian order.
| my ($mac,$nonce) = ($inp,$len); |
| |
| $code.=<<___; |
| .globl poly1305_emit |
| .align 32 |
| poly1305_emit: |
| save %sp,-STACK_FRAME,%sp |
| |
| ld [$ctx+0],$h1 ! load hash value |
| ld [$ctx+4],$h0 |
| ld [$ctx+8],$h3 |
| ld [$ctx+12],$h2 |
| ld [$ctx+16],$h4 |
| |
| addcc $h0,5,$r0 ! compare to modulus |
| addccc $h1,0,$r1 |
| addccc $h2,0,$r2 |
| addccc $h3,0,$r3 |
| addc $h4,0,$h4 |
| andcc $h4,4,%g0 ! did it carry/borrow? |
| |
| movnz %icc,$r0,$h0 |
| ld [$nonce+0],$r0 ! load nonce |
| movnz %icc,$r1,$h1 |
| ld [$nonce+4],$r1 |
| movnz %icc,$r2,$h2 |
| ld [$nonce+8],$r2 |
| movnz %icc,$r3,$h3 |
| ld [$nonce+12],$r3 |
| |
| addcc $r0,$h0,$h0 ! accumulate nonce |
| addccc $r1,$h1,$h1 |
| addccc $r2,$h2,$h2 |
| addc $r3,$h3,$h3 |
| |
| srl $h0,8,$r0 |
| stb $h0,[$mac+0] ! store little-endian result |
| srl $h0,16,$r1 |
| stb $r0,[$mac+1] |
| srl $h0,24,$r2 |
| stb $r1,[$mac+2] |
| stb $r2,[$mac+3] |
| |
| srl $h1,8,$r0 |
| stb $h1,[$mac+4] |
| srl $h1,16,$r1 |
| stb $r0,[$mac+5] |
| srl $h1,24,$r2 |
| stb $r1,[$mac+6] |
| stb $r2,[$mac+7] |
| |
| srl $h2,8,$r0 |
| stb $h2,[$mac+8] |
| srl $h2,16,$r1 |
| stb $r0,[$mac+9] |
| srl $h2,24,$r2 |
| stb $r1,[$mac+10] |
| stb $r2,[$mac+11] |
| |
| srl $h3,8,$r0 |
| stb $h3,[$mac+12] |
| srl $h3,16,$r1 |
| stb $r0,[$mac+13] |
| srl $h3,24,$r2 |
| stb $r1,[$mac+14] |
| stb $r2,[$mac+15] |
| |
| ret |
| restore |
| .type poly1305_emit,#function |
| .size poly1305_emit,.-poly1305_emit |
| ___ |
| |
| { |
# FMA code path.  This scope redeclares the argument names as %i0-%i3,
# uses %o0-%o4 for raw input words and %l0-%l3 for loop bookkeeping;
# only the first four of the eight %l names generated below are assigned.
| my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3)); |
| my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4)); |
| my ($i1,$step,$shr,$shl) = map("%l$_",(0..7)); |
# $i2 is a second name for $step (%l1), not a separate register.
| my $i2=$step; |
| |
# Floating-point names map, in order, onto the even-numbered registers
# %f0,%f2,...,%f62 produced by "%f".2*$_.
| my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi, |
| $two0,$two32,$two64,$two96,$two130,$five_two130, |
| $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi, |
| $s2lo,$s2hi,$s3lo,$s3hi, |
| $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31)); |
| # borrowings |
# The names below are aliases of the $c0..$c3 registers declared above;
# note that $y2/$y3 are taken from $c3hi/$c3lo, i.e. in swapped order.
| my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi); |
| my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi); |
| my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo); |
| |
# poly1305_init_fma and poly1305_blocks_fma.
#
# Context layout as used by the code below (byte offsets from $ctx):
#   8*0  .. 8*3    hash value, four biased doubles
#   8*4  .. 8*11   r0lo,r0hi,r1lo,r1hi,r2lo,r2hi,r3lo,r3hi
#   8*12 .. 8*17   s1lo,s1hi,s2lo,s2hi,s3lo,s3hi
#
# .Lpoly1305_init_fma is the label poly1305_init branches to.  With a zero
# key pointer the routine only stores the biased-zero hash and returns 0;
# otherwise it also derives and stores the key values listed above, writes
# the addresses of poly1305_blocks_fma and poly1305_emit_fma through %i2
# and returns 1.
#
# Both routines save %fsr on the stack, load the value kept at
# .Lconsts_fma+8*6 for the duration of the floating-point work and restore
# the saved %fsr afterwards.  poly1305_blocks_fma returns through .Labort
# without touching the context when $len/16 is zero.
#
# The fmaddd lines are replaced by .word directives (see unfma) when the
# accumulated code is printed.
| $code.=<<___; |
| .align 32 |
| poly1305_init_fma: |
| save %sp,-STACK_FRAME-16,%sp |
| nop |
| |
| .Lpoly1305_init_fma: |
| 1: call .+8 |
| add %o7,.Lconsts_fma-1b,%o7 |
| |
| ldd [%o7+8*0],$two0 ! load constants |
| ldd [%o7+8*1],$two32 |
| ldd [%o7+8*2],$two64 |
| ldd [%o7+8*3],$two96 |
| ldd [%o7+8*5],$five_two130 |
| |
| std $two0,[$ctx+8*0] ! initial hash value, biased 0 |
| std $two32,[$ctx+8*1] |
| std $two64,[$ctx+8*2] |
| std $two96,[$ctx+8*3] |
| |
| brz,pn $inp,.Lno_key_fma |
| nop |
| |
| stx %fsr,[%sp+LOCALS] ! save original %fsr |
| ldx [%o7+8*6],%fsr ! load new %fsr |
| |
| std $two0,[$ctx+8*4] ! key "template" |
| std $two32,[$ctx+8*5] |
| std $two64,[$ctx+8*6] |
| std $two96,[$ctx+8*7] |
| |
| and $inp,7,$shr |
| andn $inp,7,$inp ! align pointer |
| mov 8,$i1 |
| sll $shr,3,$shr |
| mov 16,$i2 |
| neg $shr,$shl |
| |
| ldxa [$inp+%g0]0x88,$in0 ! load little-endian key |
| ldxa [$inp+$i1]0x88,$in2 |
| |
| brz $shr,.Lkey_aligned_fma |
| sethi %hi(0xf0000000),$i1 ! 0xf0000000 |
| |
| ldxa [$inp+$i2]0x88,$in4 |
| |
| srlx $in0,$shr,$in0 ! align data |
| sllx $in2,$shl,$in1 |
| srlx $in2,$shr,$in2 |
| or $in1,$in0,$in0 |
| sllx $in4,$shl,$in3 |
| or $in3,$in2,$in2 |
| |
| .Lkey_aligned_fma: |
| or $i1,3,$i2 ! 0xf0000003 |
| srlx $in0,32,$in1 |
| andn $in0,$i1,$in0 ! &=0x0fffffff |
| andn $in1,$i2,$in1 ! &=0x0ffffffc |
| srlx $in2,32,$in3 |
| andn $in2,$i2,$in2 |
| andn $in3,$i2,$in3 |
| |
| st $in0,[$ctx+`8*4+4`] ! fill "template" |
| st $in1,[$ctx+`8*5+4`] |
| st $in2,[$ctx+`8*6+4`] |
| st $in3,[$ctx+`8*7+4`] |
| |
| ldd [$ctx+8*4],$h0lo ! load [biased] key |
| ldd [$ctx+8*5],$h1lo |
| ldd [$ctx+8*6],$h2lo |
| ldd [$ctx+8*7],$h3lo |
| |
| fsubd $h0lo,$two0, $h0lo ! r0 |
| ldd [%o7+8*7],$two0 ! more constants |
| fsubd $h1lo,$two32,$h1lo ! r1 |
| ldd [%o7+8*8],$two32 |
| fsubd $h2lo,$two64,$h2lo ! r2 |
| ldd [%o7+8*9],$two64 |
| fsubd $h3lo,$two96,$h3lo ! r3 |
| ldd [%o7+8*10],$two96 |
| |
| fmuld $five_two130,$h1lo,$s1lo ! s1 |
| fmuld $five_two130,$h2lo,$s2lo ! s2 |
| fmuld $five_two130,$h3lo,$s3lo ! s3 |
| |
| faddd $h0lo,$two0, $h0hi |
| faddd $h1lo,$two32,$h1hi |
| faddd $h2lo,$two64,$h2hi |
| faddd $h3lo,$two96,$h3hi |
| |
| fsubd $h0hi,$two0, $h0hi |
| ldd [%o7+8*11],$two0 ! more constants |
| fsubd $h1hi,$two32,$h1hi |
| ldd [%o7+8*12],$two32 |
| fsubd $h2hi,$two64,$h2hi |
| ldd [%o7+8*13],$two64 |
| fsubd $h3hi,$two96,$h3hi |
| |
| fsubd $h0lo,$h0hi,$h0lo |
| std $h0hi,[$ctx+8*5] ! r0hi |
| fsubd $h1lo,$h1hi,$h1lo |
| std $h1hi,[$ctx+8*7] ! r1hi |
| fsubd $h2lo,$h2hi,$h2lo |
| std $h2hi,[$ctx+8*9] ! r2hi |
| fsubd $h3lo,$h3hi,$h3lo |
| std $h3hi,[$ctx+8*11] ! r3hi |
| |
| faddd $s1lo,$two0, $s1hi |
| faddd $s2lo,$two32,$s2hi |
| faddd $s3lo,$two64,$s3hi |
| |
| fsubd $s1hi,$two0, $s1hi |
| fsubd $s2hi,$two32,$s2hi |
| fsubd $s3hi,$two64,$s3hi |
| |
| fsubd $s1lo,$s1hi,$s1lo |
| fsubd $s2lo,$s2hi,$s2lo |
| fsubd $s3lo,$s3hi,$s3lo |
| |
| ldx [%sp+LOCALS],%fsr ! restore %fsr |
| |
| std $h0lo,[$ctx+8*4] ! r0lo |
| std $h1lo,[$ctx+8*6] ! r1lo |
| std $h2lo,[$ctx+8*8] ! r2lo |
| std $h3lo,[$ctx+8*10] ! r3lo |
| |
| std $s1hi,[$ctx+8*13] |
| std $s2hi,[$ctx+8*15] |
| std $s3hi,[$ctx+8*17] |
| |
| std $s1lo,[$ctx+8*12] |
| std $s2lo,[$ctx+8*14] |
| std $s3lo,[$ctx+8*16] |
| |
| add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0 |
| add %o7,poly1305_emit_fma-.Lconsts_fma,%o1 |
| STPTR %o0,[%i2] |
| STPTR %o1,[%i2+SIZE_T] |
| |
| ret |
| restore %g0,1,%o0 ! return 1 |
| |
| .Lno_key_fma: |
| ret |
| restore %g0,%g0,%o0 ! return 0 |
| .type poly1305_init_fma,#function |
| .size poly1305_init_fma,.-poly1305_init_fma |
| |
| .align 32 |
| poly1305_blocks_fma: |
| save %sp,-STACK_FRAME-48,%sp |
| srln $len,4,$len |
| |
| brz,pn $len,.Labort |
| sub $len,1,$len |
| |
| 1: call .+8 |
| add %o7,.Lconsts_fma-1b,%o7 |
| |
| ldd [%o7+8*0],$two0 ! load constants |
| ldd [%o7+8*1],$two32 |
| ldd [%o7+8*2],$two64 |
| ldd [%o7+8*3],$two96 |
| ldd [%o7+8*4],$two130 |
| ldd [%o7+8*5],$five_two130 |
| |
| ldd [$ctx+8*0],$h0lo ! load [biased] hash value |
| ldd [$ctx+8*1],$h1lo |
| ldd [$ctx+8*2],$h2lo |
| ldd [$ctx+8*3],$h3lo |
| |
| std $two0,[%sp+LOCALS+8*0] ! input "template" |
| sethi %hi((1023+52+96)<<20),$in3 |
| std $two32,[%sp+LOCALS+8*1] |
| or $padbit,$in3,$in3 |
| std $two64,[%sp+LOCALS+8*2] |
| st $in3,[%sp+LOCALS+8*3] |
| |
| and $inp,7,$shr |
| andn $inp,7,$inp ! align pointer |
| mov 8,$i1 |
| sll $shr,3,$shr |
| mov 16,$step |
| neg $shr,$shl |
| |
| ldxa [$inp+%g0]0x88,$in0 ! load little-endian input |
| brz $shr,.Linp_aligned_fma |
| ldxa [$inp+$i1]0x88,$in2 |
| |
| ldxa [$inp+$step]0x88,$in4 |
| add $inp,8,$inp |
| |
| srlx $in0,$shr,$in0 ! align data |
| sllx $in2,$shl,$in1 |
| srlx $in2,$shr,$in2 |
| or $in1,$in0,$in0 |
| sllx $in4,$shl,$in3 |
| srlx $in4,$shr,$in4 ! pre-shift |
| or $in3,$in2,$in2 |
| |
| .Linp_aligned_fma: |
| srlx $in0,32,$in1 |
| movrz $len,0,$step |
| srlx $in2,32,$in3 |
| add $step,$inp,$inp ! conditional advance |
| |
| st $in0,[%sp+LOCALS+8*0+4] ! fill "template" |
| st $in1,[%sp+LOCALS+8*1+4] |
| st $in2,[%sp+LOCALS+8*2+4] |
| st $in3,[%sp+LOCALS+8*3+4] |
| |
| ldd [$ctx+8*4],$r0lo ! load key |
| ldd [$ctx+8*5],$r0hi |
| ldd [$ctx+8*6],$r1lo |
| ldd [$ctx+8*7],$r1hi |
| ldd [$ctx+8*8],$r2lo |
| ldd [$ctx+8*9],$r2hi |
| ldd [$ctx+8*10],$r3lo |
| ldd [$ctx+8*11],$r3hi |
| ldd [$ctx+8*12],$s1lo |
| ldd [$ctx+8*13],$s1hi |
| ldd [$ctx+8*14],$s2lo |
| ldd [$ctx+8*15],$s2hi |
| ldd [$ctx+8*16],$s3lo |
| ldd [$ctx+8*17],$s3hi |
| |
| stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr |
| ldx [%o7+8*6],%fsr ! load new %fsr |
| |
| subcc $len,1,$len |
| movrz $len,0,$step |
| |
| ldd [%sp+LOCALS+8*0],$x0 ! load biased input |
| ldd [%sp+LOCALS+8*1],$x1 |
| ldd [%sp+LOCALS+8*2],$x2 |
| ldd [%sp+LOCALS+8*3],$x3 |
| |
| fsubd $h0lo,$two0, $h0lo ! de-bias hash value |
| fsubd $h1lo,$two32,$h1lo |
| ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load |
| fsubd $h2lo,$two64,$h2lo |
| fsubd $h3lo,$two96,$h3lo |
| ldxa [$inp+$i1]0x88,$in2 |
| |
| fsubd $x0,$two0, $x0 ! de-bias input |
| fsubd $x1,$two32,$x1 |
| fsubd $x2,$two64,$x2 |
| fsubd $x3,$two96,$x3 |
| |
| brz $shr,.Linp_aligned_fma2 |
| add $step,$inp,$inp ! conditional advance |
| |
| sllx $in0,$shl,$in1 ! align data |
| srlx $in0,$shr,$in3 |
| or $in1,$in4,$in0 |
| sllx $in2,$shl,$in1 |
| srlx $in2,$shr,$in4 ! pre-shift |
| or $in3,$in1,$in2 |
| .Linp_aligned_fma2: |
| srlx $in0,32,$in1 |
| srlx $in2,32,$in3 |
| |
| faddd $h0lo,$x0,$x0 ! accumulate input |
| stw $in0,[%sp+LOCALS+8*0+4] |
| faddd $h1lo,$x1,$x1 |
| stw $in1,[%sp+LOCALS+8*1+4] |
| faddd $h2lo,$x2,$x2 |
| stw $in2,[%sp+LOCALS+8*2+4] |
| faddd $h3lo,$x3,$x3 |
| stw $in3,[%sp+LOCALS+8*3+4] |
| |
| b .Lentry_fma |
| nop |
| |
| .align 16 |
| .Loop_fma: |
| ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load |
| ldxa [$inp+$i1]0x88,$in2 |
| movrz $len,0,$step |
| |
| faddd $y0,$h0lo,$h0lo ! accumulate input |
| faddd $y1,$h0hi,$h0hi |
| faddd $y2,$h2lo,$h2lo |
| faddd $y3,$h2hi,$h2hi |
| |
| brz,pn $shr,.Linp_aligned_fma3 |
| add $step,$inp,$inp ! conditional advance |
| |
| sllx $in0,$shl,$in1 ! align data |
| srlx $in0,$shr,$in3 |
| or $in1,$in4,$in0 |
| sllx $in2,$shl,$in1 |
| srlx $in2,$shr,$in4 ! pre-shift |
| or $in3,$in1,$in2 |
| |
| .Linp_aligned_fma3: |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32 |
| faddd $two64,$h1lo,$c1lo |
| srlx $in0,32,$in1 |
| faddd $two64,$h1hi,$c1hi |
| srlx $in2,32,$in3 |
| faddd $two130,$h3lo,$c3lo |
| st $in0,[%sp+LOCALS+8*0+4] ! fill "template" |
| faddd $two130,$h3hi,$c3hi |
| st $in1,[%sp+LOCALS+8*1+4] |
| faddd $two32,$h0lo,$c0lo |
| st $in2,[%sp+LOCALS+8*2+4] |
| faddd $two32,$h0hi,$c0hi |
| st $in3,[%sp+LOCALS+8*3+4] |
| faddd $two96,$h2lo,$c2lo |
| faddd $two96,$h2hi,$c2hi |
| |
| fsubd $c1lo,$two64,$c1lo |
| fsubd $c1hi,$two64,$c1hi |
| fsubd $c3lo,$two130,$c3lo |
| fsubd $c3hi,$two130,$c3hi |
| fsubd $c0lo,$two32,$c0lo |
| fsubd $c0hi,$two32,$c0hi |
| fsubd $c2lo,$two96,$c2lo |
| fsubd $c2hi,$two96,$c2hi |
| |
| fsubd $h1lo,$c1lo,$h1lo |
| fsubd $h1hi,$c1hi,$h1hi |
| fsubd $h3lo,$c3lo,$h3lo |
| fsubd $h3hi,$c3hi,$h3hi |
| fsubd $h2lo,$c2lo,$h2lo |
| fsubd $h2hi,$c2hi,$h2hi |
| fsubd $h0lo,$c0lo,$h0lo |
| fsubd $h0hi,$c0hi,$h0hi |
| |
| faddd $h1lo,$c0lo,$h1lo |
| faddd $h1hi,$c0hi,$h1hi |
| faddd $h3lo,$c2lo,$h3lo |
| faddd $h3hi,$c2hi,$h3hi |
| faddd $h2lo,$c1lo,$h2lo |
| faddd $h2hi,$c1hi,$h2hi |
| fmaddd $five_two130,$c3lo,$h0lo,$h0lo |
| fmaddd $five_two130,$c3hi,$h0hi,$h0hi |
| |
| faddd $h1lo,$h1hi,$x1 |
| ldd [$ctx+8*12],$s1lo ! reload constants |
| faddd $h3lo,$h3hi,$x3 |
| ldd [$ctx+8*13],$s1hi |
| faddd $h2lo,$h2hi,$x2 |
| ldd [$ctx+8*10],$r3lo |
| faddd $h0lo,$h0hi,$x0 |
| ldd [$ctx+8*11],$r3hi |
| |
| .Lentry_fma: |
| fmuld $x1,$s3lo,$h0lo |
| fmuld $x1,$s3hi,$h0hi |
| fmuld $x1,$r1lo,$h2lo |
| fmuld $x1,$r1hi,$h2hi |
| fmuld $x1,$r0lo,$h1lo |
| fmuld $x1,$r0hi,$h1hi |
| fmuld $x1,$r2lo,$h3lo |
| fmuld $x1,$r2hi,$h3hi |
| |
| fmaddd $x3,$s1lo,$h0lo,$h0lo |
| fmaddd $x3,$s1hi,$h0hi,$h0hi |
| fmaddd $x3,$s3lo,$h2lo,$h2lo |
| fmaddd $x3,$s3hi,$h2hi,$h2hi |
| fmaddd $x3,$s2lo,$h1lo,$h1lo |
| fmaddd $x3,$s2hi,$h1hi,$h1hi |
| fmaddd $x3,$r0lo,$h3lo,$h3lo |
| fmaddd $x3,$r0hi,$h3hi,$h3hi |
| |
| fmaddd $x2,$s2lo,$h0lo,$h0lo |
| fmaddd $x2,$s2hi,$h0hi,$h0hi |
| fmaddd $x2,$r0lo,$h2lo,$h2lo |
| fmaddd $x2,$r0hi,$h2hi,$h2hi |
| fmaddd $x2,$s3lo,$h1lo,$h1lo |
| ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input |
| fmaddd $x2,$s3hi,$h1hi,$h1hi |
| ldd [%sp+LOCALS+8*1],$y1 |
| fmaddd $x2,$r1lo,$h3lo,$h3lo |
| ldd [%sp+LOCALS+8*2],$y2 |
| fmaddd $x2,$r1hi,$h3hi,$h3hi |
| ldd [%sp+LOCALS+8*3],$y3 |
| |
| fmaddd $x0,$r0lo,$h0lo,$h0lo |
| fsubd $y0,$two0, $y0 ! de-bias input |
| fmaddd $x0,$r0hi,$h0hi,$h0hi |
| fsubd $y1,$two32,$y1 |
| fmaddd $x0,$r2lo,$h2lo,$h2lo |
| fsubd $y2,$two64,$y2 |
| fmaddd $x0,$r2hi,$h2hi,$h2hi |
| fsubd $y3,$two96,$y3 |
| fmaddd $x0,$r1lo,$h1lo,$h1lo |
| fmaddd $x0,$r1hi,$h1hi,$h1hi |
| fmaddd $x0,$r3lo,$h3lo,$h3lo |
| fmaddd $x0,$r3hi,$h3hi,$h3hi |
| |
| bcc SIZE_T_CC,.Loop_fma |
| subcc $len,1,$len |
| |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32 |
| faddd $h0lo,$two32,$c0lo |
| faddd $h0hi,$two32,$c0hi |
| faddd $h2lo,$two96,$c2lo |
| faddd $h2hi,$two96,$c2hi |
| faddd $h1lo,$two64,$c1lo |
| faddd $h1hi,$two64,$c1hi |
| faddd $h3lo,$two130,$c3lo |
| faddd $h3hi,$two130,$c3hi |
| |
| fsubd $c0lo,$two32,$c0lo |
| fsubd $c0hi,$two32,$c0hi |
| fsubd $c2lo,$two96,$c2lo |
| fsubd $c2hi,$two96,$c2hi |
| fsubd $c1lo,$two64,$c1lo |
| fsubd $c1hi,$two64,$c1hi |
| fsubd $c3lo,$two130,$c3lo |
| fsubd $c3hi,$two130,$c3hi |
| |
| fsubd $h1lo,$c1lo,$h1lo |
| fsubd $h1hi,$c1hi,$h1hi |
| fsubd $h3lo,$c3lo,$h3lo |
| fsubd $h3hi,$c3hi,$h3hi |
| fsubd $h2lo,$c2lo,$h2lo |
| fsubd $h2hi,$c2hi,$h2hi |
| fsubd $h0lo,$c0lo,$h0lo |
| fsubd $h0hi,$c0hi,$h0hi |
| |
| faddd $h1lo,$c0lo,$h1lo |
| faddd $h1hi,$c0hi,$h1hi |
| faddd $h3lo,$c2lo,$h3lo |
| faddd $h3hi,$c2hi,$h3hi |
| faddd $h2lo,$c1lo,$h2lo |
| faddd $h2hi,$c1hi,$h2hi |
| fmaddd $five_two130,$c3lo,$h0lo,$h0lo |
| fmaddd $five_two130,$c3hi,$h0hi,$h0hi |
| |
| faddd $h1lo,$h1hi,$x1 |
| faddd $h3lo,$h3hi,$x3 |
| faddd $h2lo,$h2hi,$x2 |
| faddd $h0lo,$h0hi,$x0 |
| |
| faddd $x1,$two32,$x1 ! bias |
| faddd $x3,$two96,$x3 |
| faddd $x2,$two64,$x2 |
| faddd $x0,$two0, $x0 |
| |
| ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr |
| |
| std $x1,[$ctx+8*1] ! store [biased] hash value |
| std $x3,[$ctx+8*3] |
| std $x2,[$ctx+8*2] |
| std $x0,[$ctx+8*0] |
| |
| .Labort: |
| ret |
| restore |
| .type poly1305_blocks_fma,#function |
| .size poly1305_blocks_fma,.-poly1305_blocks_fma |
| ___ |
| { |
# poly1305_emit_fma: finalization for the FMA code path.
# $mac (output pointer) and $nonce (nonce pointer) are new names for the
# second and third argument registers.  Each of the four stored hash
# doubles is read as two 32-bit words: the word at offset +4 becomes limb
# $h0..$h3, and the word at offset +0, once the 0xfff00000 bits are cleared,
# is added into the following limb.  The masked upper word of the fourth
# double is split into $h4 (low two bits) and a part that is folded back
# into $h0.  $padbit (%i3) is used as a scratch register here.
# The choice between hash and hash+5 is made with an and/andn/or mask
# rather than a conditional move, then the nonce is added and the 16 result
# bytes are written in little-endian order.
| my ($mac,$nonce)=($inp,$len); |
| |
# Ten names are taken from %l0-%l5 followed by %o0-%o4; the last generated
# register (%o4) is left unassigned.
| my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask |
| ) = (map("%l$_",(0..5)),map("%o$_",(0..4))); |
| |
| $code.=<<___; |
| .align 32 |
| poly1305_emit_fma: |
| save %sp,-STACK_FRAME,%sp |
| |
| ld [$ctx+8*0+0],$d0 ! load hash |
| ld [$ctx+8*0+4],$h0 |
| ld [$ctx+8*1+0],$d1 |
| ld [$ctx+8*1+4],$h1 |
| ld [$ctx+8*2+0],$d2 |
| ld [$ctx+8*2+4],$h2 |
| ld [$ctx+8*3+0],$d3 |
| ld [$ctx+8*3+4],$h3 |
| |
| sethi %hi(0xfff00000),$mask |
| andn $d0,$mask,$d0 ! mask exponent |
| andn $d1,$mask,$d1 |
| andn $d2,$mask,$d2 |
| andn $d3,$mask,$d3 ! can be partially reduced... |
| mov 3,$mask |
| |
| srl $d3,2,$padbit ! ... so reduce |
| and $d3,$mask,$h4 |
| andn $d3,$mask,$d3 |
| add $padbit,$d3,$d3 |
| |
| addcc $d3,$h0,$h0 |
| addccc $d0,$h1,$h1 |
| addccc $d1,$h2,$h2 |
| addccc $d2,$h3,$h3 |
| addc %g0,$h4,$h4 |
| |
| addcc $h0,5,$d0 ! compare to modulus |
| addccc $h1,0,$d1 |
| addccc $h2,0,$d2 |
| addccc $h3,0,$d3 |
| addc $h4,0,$mask |
| |
| srl $mask,2,$mask ! did it carry/borrow? |
| neg $mask,$mask |
| sra $mask,31,$mask ! mask |
| |
| andn $h0,$mask,$h0 |
| and $d0,$mask,$d0 |
| andn $h1,$mask,$h1 |
| and $d1,$mask,$d1 |
| or $d0,$h0,$h0 |
| ld [$nonce+0],$d0 ! load nonce |
| andn $h2,$mask,$h2 |
| and $d2,$mask,$d2 |
| or $d1,$h1,$h1 |
| ld [$nonce+4],$d1 |
| andn $h3,$mask,$h3 |
| and $d3,$mask,$d3 |
| or $d2,$h2,$h2 |
| ld [$nonce+8],$d2 |
| or $d3,$h3,$h3 |
| ld [$nonce+12],$d3 |
| |
| addcc $d0,$h0,$h0 ! accumulate nonce |
| addccc $d1,$h1,$h1 |
| addccc $d2,$h2,$h2 |
| addc $d3,$h3,$h3 |
| |
| stb $h0,[$mac+0] ! write little-endian result |
| srl $h0,8,$h0 |
| stb $h1,[$mac+4] |
| srl $h1,8,$h1 |
| stb $h2,[$mac+8] |
| srl $h2,8,$h2 |
| stb $h3,[$mac+12] |
| srl $h3,8,$h3 |
| |
| stb $h0,[$mac+1] |
| srl $h0,8,$h0 |
| stb $h1,[$mac+5] |
| srl $h1,8,$h1 |
| stb $h2,[$mac+9] |
| srl $h2,8,$h2 |
| stb $h3,[$mac+13] |
| srl $h3,8,$h3 |
| |
| stb $h0,[$mac+2] |
| srl $h0,8,$h0 |
| stb $h1,[$mac+6] |
| srl $h1,8,$h1 |
| stb $h2,[$mac+10] |
| srl $h2,8,$h2 |
| stb $h3,[$mac+14] |
| srl $h3,8,$h3 |
| |
| stb $h0,[$mac+3] |
| stb $h1,[$mac+7] |
| stb $h2,[$mac+11] |
| stb $h3,[$mac+15] |
| |
| ret |
| restore |
| .type poly1305_emit_fma,#function |
| .size poly1305_emit_fma,.-poly1305_emit_fma |
| ___ |
| } |
| |
# Constant table for the FMA routines.  poly1305_init_fma and
# poly1305_blocks_fma address it as [%o7+8*n] relative to .Lconsts_fma,
# so each entry occupies 8 bytes and the order below must match those
# indices: n=0..5 are the doubles loaded as "constants", n=6 is the value
# loaded into %fsr, n=7..13 are the "more constants" used by
# poly1305_init_fma.  The closing brace after the heredoc ends the scope
# that holds the FMA register names.
| $code.=<<___; |
| .align 64 |
| .Lconsts_fma: |
| .word 0x43300000,0x00000000 ! 2^(52+0) |
| .word 0x45300000,0x00000000 ! 2^(52+32) |
| .word 0x47300000,0x00000000 ! 2^(52+64) |
| .word 0x49300000,0x00000000 ! 2^(52+96) |
| .word 0x4b500000,0x00000000 ! 2^(52+130) |
| |
| .word 0x37f40000,0x00000000 ! 5/2^130 |
| .word 0,1<<30 ! fsr: truncate, no exceptions |
| |
| .word 0x44300000,0x00000000 ! 2^(52+16+0) |
| .word 0x46300000,0x00000000 ! 2^(52+16+32) |
| .word 0x48300000,0x00000000 ! 2^(52+16+64) |
| .word 0x4a300000,0x00000000 ! 2^(52+16+96) |
| .word 0x3e300000,0x00000000 ! 2^(52+16+0-96) |
| .word 0x40300000,0x00000000 ! 2^(52+16+32-96) |
| .word 0x42300000,0x00000000 ! 2^(52+16+64-96) |
| .asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 4 |
| ___ |
| } |
| |
# The purpose of the encoder subroutines below is to emit VIS3 and FMA
# instructions as explicit .word directives, so that the module can be
# assembled without enabling those extensions on the compiler command
# line, e.g. -xarch=v9 vs. -xarch=v9a.  This keeps open the option of
# building a "universal" binary that detects at run time whether the
# current CPU supports the instructions.

# unvis3(mnemonic, rs1, rs2, rd)
#
# Returns ".word\t0x........ !<original text>" for addxc, addxccc and
# umulxhi when all three operands are integer registers of the form
# %[goli][0-7].  For any other mnemonic, or when an operand is not such a
# register, the instruction text "mnemonic\trs1,rs2,rd" is returned
# unchanged so the assembler sees (and can diagnose) it as written.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # first register number of each window group: %g0=0, %o0=8, %l0=16, %i0=24
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless defined $opf;

    # foreach aliases its variable to $rs1/$rs2/$rd, so the assignment
    # replaces each register name with its 5-bit register number.
    foreach my $reg ($rs1,$rs2,$rd) {
        # Anchored, and limited to 0-7: an unanchored [0-9] match would let
        # e.g. %o9 through and encode it as 8+9=17, i.e. %l1.
        return $ref unless $reg =~ /^%([goli])([0-7])$/;
        $reg = $bias{$1}+$2;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                   $ref;
}
| |
# unfma(mnemonic, rs1, rs2, rs3, rd)
#
# Returns ".word\t0x........ !<original text>" for fmadds, fmaddd, fmsubs
# and fmsubd when all four operands are floating-point registers %f0..%f63
# (registers %f32 and above must be even-numbered).  For any other
# mnemonic or operand the instruction text "mnemonic\trs1,rs2,rs3,rd" is
# returned unchanged so the assembler sees it as written.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref unless defined $opf;

    # foreach aliases its variable to $rs1/$rs2/$rs3/$rd, so the assignment
    # replaces each register name with its 5-bit encoding.
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
        # Anchored so that e.g. %f100 is not silently read as %f10.
        return $ref unless $reg =~ /^%f([0-9]{1,2})$/;
        my $num = int($1);
        # Numbers above 63 would wrap in the re-encoding below
        # (%f64 would come out as %f2), so leave them to the assembler.
        return $ref if $num > 63;
        if ($num >= 32) {
            return $ref if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        $reg = $num;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                   $ref;
}
| |
# Post-process the accumulated assembly text line by line and print it.
for my $line (split("\n",$code)) {
    # Evaluate arithmetic written between backticks, e.g. `8*4+4`.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # Encode VIS3 instructions as .word; the FMA substitution is attempted
    # only on lines where no VIS3 instruction was replaced.
    unless ($line =~ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge) {
        $line =~ s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/unfma($1,$2,$3,$4,$5)/ge;
    }

    print "$line\n";
}

# Buffered write errors only surface here, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";