;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the Apache License 2.0 (the "License"). You may not use
;; this file except in compliance with the License. You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the License. Warranty of any kind is disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
| .text |
| |
| .if .ASSEMBLER_VERSION<7000000 |
| .asg 0,__TI_EABI__ |
| .endif |
| .if __TI_EABI__ |
| .asg bn_mul_add_words,_bn_mul_add_words |
| .asg bn_mul_words,_bn_mul_words |
| .asg bn_sqr_words,_bn_sqr_words |
| .asg bn_add_words,_bn_add_words |
| .asg bn_sub_words,_bn_sub_words |
| .asg bn_div_words,_bn_div_words |
| .asg bn_sqr_comba8,_bn_sqr_comba8 |
| .asg bn_mul_comba8,_bn_mul_comba8 |
| .asg bn_sqr_comba4,_bn_sqr_comba4 |
| .asg bn_mul_comba4,_bn_mul_comba4 |
| .endif |
| |
| .asg B3,RA |
| .asg A4,ARG0 |
| .asg B4,ARG1 |
| .asg A6,ARG2 |
| .asg B6,ARG3 |
| .asg A8,ARG4 |
| .asg B8,ARG5 |
| .asg A4,RET |
| .asg A15,FP |
| .asg B14,DP |
| .asg B15,SP |
| |
| .global _bn_mul_add_words |
| _bn_mul_add_words: |
| .asmfunc |
| MV ARG2,B0 |
| [!B0] BNOP RA |
| ||[!B0] MVK 0,RET |
| [B0] MVC B0,ILC |
| [B0] ZERO A19 ; high part of accumulator |
| || [B0] MV ARG0,A2 |
| || [B0] MV ARG3,A3 |
| NOP 3 |
| |
| SPLOOP 2 ; 2*n+10 |
| ;;==================================================================== |
| LDW *ARG1++,B7 ; ap[i] |
| NOP 3 |
| LDW *ARG0++,A7 ; rp[i] |
| MPY32U B7,A3,A17:A16 |
| NOP 3 ; [2,0] in epilogue |
| ADDU A16,A7,A21:A20 |
| ADDU A19,A21:A20,A19:A18 |
| || MV.S A17,A23 |
| SPKERNEL 2,1 ; leave slot for "return value" |
| || STW A18,*A2++ ; rp[i] |
| || ADD A19,A23,A19 |
| ;;==================================================================== |
| BNOP RA,4 |
| MV A19,RET ; return value |
| .endasmfunc |
| |
| .global _bn_mul_words |
| _bn_mul_words: |
| .asmfunc |
| MV ARG2,B0 |
| [!B0] BNOP RA |
| ||[!B0] MVK 0,RET |
| [B0] MVC B0,ILC |
| [B0] ZERO A19 ; high part of accumulator |
| NOP 3 |
| |
| SPLOOP 2 ; 2*n+10 |
| ;;==================================================================== |
| LDW *ARG1++,A7 ; ap[i] |
| NOP 4 |
| MPY32U A7,ARG3,A17:A16 |
| NOP 4 ; [2,0] in epiloque |
| ADDU A19,A16,A19:A18 |
| || MV.S A17,A21 |
| SPKERNEL 2,1 ; leave slot for "return value" |
| || STW A18,*ARG0++ ; rp[i] |
| || ADD.L A19,A21,A19 |
| ;;==================================================================== |
| BNOP RA,4 |
| MV A19,RET ; return value |
| .endasmfunc |
| |
| .global _bn_sqr_words |
| _bn_sqr_words: |
| .asmfunc |
| MV ARG2,B0 |
| [!B0] BNOP RA |
| ||[!B0] MVK 0,RET |
| [B0] MVC B0,ILC |
| [B0] MV ARG0,B2 |
| || [B0] ADD 4,ARG0,ARG0 |
| NOP 3 |
| |
| SPLOOP 2 ; 2*n+10 |
| ;;==================================================================== |
| LDW *ARG1++,B7 ; ap[i] |
| NOP 4 |
| MPY32U B7,B7,B1:B0 |
| NOP 3 ; [2,0] in epilogue |
| STW B0,*B2++(8) ; rp[2*i] |
| MV B1,A1 |
| SPKERNEL 2,0 ; fully overlap BNOP RA,5 |
| || STW A1,*ARG0++(8) ; rp[2*i+1] |
| ;;==================================================================== |
| BNOP RA,5 |
| .endasmfunc |
| |
| .global _bn_add_words |
| _bn_add_words: |
| .asmfunc |
| MV ARG3,B0 |
| [!B0] BNOP RA |
| ||[!B0] MVK 0,RET |
| [B0] MVC B0,ILC |
| [B0] ZERO A1 ; carry flag |
| || [B0] MV ARG0,A3 |
| NOP 3 |
| |
| SPLOOP 2 ; 2*n+6 |
| ;;==================================================================== |
| LDW *ARG2++,A7 ; bp[i] |
| || LDW *ARG1++,B7 ; ap[i] |
| NOP 4 |
| ADDU A7,B7,A9:A8 |
| ADDU A1,A9:A8,A1:A0 |
| SPKERNEL 0,0 ; fully overlap BNOP RA,5 |
| || STW A0,*A3++ ; write result |
| || MV A1,RET ; keep carry flag in RET |
| ;;==================================================================== |
| BNOP RA,5 |
| .endasmfunc |
| |
| .global _bn_sub_words |
| _bn_sub_words: |
| .asmfunc |
| MV ARG3,B0 |
| [!B0] BNOP RA |
| ||[!B0] MVK 0,RET |
| [B0] MVC B0,ILC |
| [B0] ZERO A2 ; borrow flag |
| || [B0] MV ARG0,A3 |
| NOP 3 |
| |
| SPLOOP 2 ; 2*n+6 |
| ;;==================================================================== |
| LDW *ARG2++,A7 ; bp[i] |
| || LDW *ARG1++,B7 ; ap[i] |
| NOP 4 |
| SUBU B7,A7,A1:A0 |
| [A2] SUB A1:A0,1,A1:A0 |
| SPKERNEL 0,1 ; leave slot for "return borrow flag" |
| || STW A0,*A3++ ; write result |
| || AND 1,A1,A2 ; pass on borrow flag |
| ;;==================================================================== |
| BNOP RA,4 |
| AND 1,A1,RET ; return borrow flag |
| .endasmfunc |
| |
| .global _bn_div_words |
| _bn_div_words: |
| .asmfunc |
| LMBD 1,A6,A0 ; leading zero bits in dv |
| LMBD 1,A4,A1 ; leading zero bits in hi |
| || MVK 32,B0 |
| CMPLTU A1,A0,A2 |
| || ADD A0,B0,B0 |
| [ A2] BNOP RA |
| ||[ A2] MVK -1,A4 ; return overflow |
| ||[!A2] MV A4,A3 ; reassign hi |
| [!A2] MV B4,A4 ; reassign lo, will be quotient |
| ||[!A2] MVC B0,ILC |
| [!A2] SHL A6,A0,A6 ; normalize dv |
| || MVK 1,A1 |
| |
| [!A2] CMPLTU A3,A6,A1 ; hi<dv? |
| ||[!A2] SHL A4,1,A5:A4 ; lo<<1 |
| [!A1] SUB A3,A6,A3 ; hi-=dv |
| ||[!A1] OR 1,A4,A4 |
| [!A2] SHRU A3,31,A1 ; upper bit |
| ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31 |
| |
| SPLOOP 3 |
| [!A1] CMPLTU A3,A6,A1 ; hi<dv? |
| ||[ A1] ZERO A1 |
| || SHL A4,1,A5:A4 ; lo<<1 |
| [!A1] SUB A3,A6,A3 ; hi-=dv |
| ||[!A1] OR 1,A4,A4 ; quotient |
| SHRU A3,31,A1 ; upper bit |
| || ADDAH A5,A3,A3 ; hi<<1|lo>>31 |
| SPKERNEL |
| |
| BNOP RA,5 |
| .endasmfunc |
| |
;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally a larger undertaking. The purpose of this
;; exercise was rather to learn to master nested SPLOOPs...
;;====================================================================
| .global _bn_sqr_comba8 |
| .global _bn_mul_comba8 |
| _bn_sqr_comba8: |
| MV ARG1,ARG2 |
| _bn_mul_comba8: |
| .asmfunc |
| MVK 8,B0 ; N, RILC |
| || MVK 8,A0 ; M, outer loop counter |
| || MV ARG1,A5 ; copy ap |
| || MV ARG0,B4 ; copy rp |
| || ZERO B19 ; high part of accumulator |
| MVC B0,RILC |
| || SUB B0,2,B1 ; N-2, initial ILC |
| || SUB B0,1,B2 ; const B2=N-1 |
| || LDW *A5++,B6 ; ap[0] |
| || MV A0,A3 ; const A3=M |
| sploopNxM?: ; for best performance arrange M<=N |
| [A0] SPLOOPD 2 ; 2*n+10 |
| || MVC B1,ILC |
| || ADDAW B4,B0,B5 |
| || ZERO B7 |
| || LDW *A5++,A9 ; pre-fetch ap[1] |
| || ZERO A1 |
| || SUB A0,1,A0 |
| ;;==================================================================== |
| ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files. |
| ;; This is because of Advisory 15 from TI publication SPRZ247I. |
| LDW *ARG2++,A7 ; bp[i] |
| NOP 3 |
| [A1] LDW *B5++,B7 ; rp[i] |
| MPY32U A7,B6,B17:B16 |
| NOP 3 |
| ADDU B16,B7,B21:B20 |
| ADDU B19,B21:B20,B19:B18 |
| || MV.S B17,B23 |
| SPKERNEL |
| || STW B18,*B4++ ; rp[i] |
| || ADD.S B19,B23,B19 |
| ;;==================================================================== |
| outer?: ; m*2*(n+1)+10 |
| SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0] |
| SPMASKR |
| || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]? |
| MVD A9,B6 ; move through .M unit(*) |
| [A2] LDW *A5++,A9 ; pre-fetch ap[i+1] |
| SUBAW B5,B2,B5 ; rewind rp to rp[1] |
| MVK 1,A1 |
| [A0] BNOP.S1 outer?,4 |
| || [A0] SUB.L A0,1,A0 |
| STW B19,*B4--[B2] ; rewind rp tp rp[1] |
| || ZERO.S B19 ; high part of accumulator |
| ;; end of outer? |
| BNOP RA,5 ; return |
| .endasmfunc |
| ;; (*) It should be noted that B6 is used as input to MPY32U in |
| ;; chronologically next cycle in *preceding* SPLOOP iteration. |
| ;; Normally such arrangement would require DINT, but at this |
| ;; point SPLOOP is draining and interrupts are disabled |
| ;; implicitly. |
| |
| .global _bn_sqr_comba4 |
| .global _bn_mul_comba4 |
| _bn_sqr_comba4: |
| MV ARG1,ARG2 |
| _bn_mul_comba4: |
| .asmfunc |
| .if 0 |
| BNOP sploopNxM?,3 |
| ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case, |
| ;; because of low-counter effect, when prologue phase finishes |
| ;; before SPKERNEL instruction is reached. As result it's 25% |
| ;; slower than expected... |
| MVK 4,B0 ; N, RILC |
| || MVK 4,A0 ; M, outer loop counter |
| || MV ARG1,A5 ; copy ap |
| || MV ARG0,B4 ; copy rp |
| || ZERO B19 ; high part of accumulator |
| MVC B0,RILC |
| || SUB B0,2,B1 ; first ILC |
| || SUB B0,1,B2 ; const B2=N-1 |
| || LDW *A5++,B6 ; ap[0] |
| || MV A0,A3 ; const A3=M |
| .else |
| ;; This alternative is an exercise in fully unrolled Comba |
| ;; algorithm implementation that operates at n*(n+1)+12, or |
| ;; as little as 32 cycles... |
| LDW *ARG1[0],B16 ; a[0] |
| || LDW *ARG2[0],A16 ; b[0] |
| LDW *ARG1[1],B17 ; a[1] |
| || LDW *ARG2[1],A17 ; b[1] |
| LDW *ARG1[2],B18 ; a[2] |
| || LDW *ARG2[2],A18 ; b[2] |
| LDW *ARG1[3],B19 ; a[3] |
| || LDW *ARG2[3],A19 ; b[3] |
| NOP |
| MPY32U A16,B16,A1:A0 ; a[0]*b[0] |
| MPY32U A17,B16,A23:A22 ; a[0]*b[1] |
| MPY32U A16,B17,A25:A24 ; a[1]*b[0] |
| MPY32U A16,B18,A27:A26 ; a[2]*b[0] |
| STW A0,*ARG0[0] |
| || MPY32U A17,B17,A29:A28 ; a[1]*b[1] |
| MPY32U A18,B16,A31:A30 ; a[0]*b[2] |
| || ADDU A22,A1,A1:A0 |
| MV A23,B0 |
| || MPY32U A19,B16,A21:A20 ; a[3]*b[0] |
| || ADDU A24,A1:A0,A1:A0 |
| ADDU A25,B0,B1:B0 |
| || STW A0,*ARG0[1] |
| || MPY32U A18,B17,A23:A22 ; a[2]*b[1] |
| || ADDU A26,A1,A9:A8 |
| ADDU A27,B1,B9:B8 |
| || MPY32U A17,B18,A25:A24 ; a[1]*b[2] |
| || ADDU A28,A9:A8,A9:A8 |
| ADDU A29,B9:B8,B9:B8 |
| || MPY32U A16,B19,A27:A26 ; a[0]*b[3] |
| || ADDU A30,A9:A8,A9:A8 |
| ADDU A31,B9:B8,B9:B8 |
| || ADDU B0,A9:A8,A9:A8 |
| STW A8,*ARG0[2] |
| || ADDU A20,A9,A1:A0 |
| ADDU A21,B9,B1:B0 |
| || MPY32U A19,B17,A21:A20 ; a[3]*b[1] |
| || ADDU A22,A1:A0,A1:A0 |
| ADDU A23,B1:B0,B1:B0 |
| || MPY32U A18,B18,A23:A22 ; a[2]*b[2] |
| || ADDU A24,A1:A0,A1:A0 |
| ADDU A25,B1:B0,B1:B0 |
| || MPY32U A17,B19,A25:A24 ; a[1]*b[3] |
| || ADDU A26,A1:A0,A1:A0 |
| ADDU A27,B1:B0,B1:B0 |
| || ADDU B8,A1:A0,A1:A0 |
| STW A0,*ARG0[3] |
| || MPY32U A19,B18,A27:A26 ; a[3]*b[2] |
| || ADDU A20,A1,A9:A8 |
| ADDU A21,B1,B9:B8 |
| || MPY32U A18,B19,A29:A28 ; a[2]*b[3] |
| || ADDU A22,A9:A8,A9:A8 |
| ADDU A23,B9:B8,B9:B8 |
| || MPY32U A19,B19,A31:A30 ; a[3]*b[3] |
| || ADDU A24,A9:A8,A9:A8 |
| ADDU A25,B9:B8,B9:B8 |
| || ADDU B0,A9:A8,A9:A8 |
| STW A8,*ARG0[4] |
| || ADDU A26,A9,A1:A0 |
| ADDU A27,B9,B1:B0 |
| || ADDU A28,A1:A0,A1:A0 |
| ADDU A29,B1:B0,B1:B0 |
| || BNOP RA |
| || ADDU B8,A1:A0,A1:A0 |
| STW A0,*ARG0[5] |
| || ADDU A30,A1,A9:A8 |
| ADD A31,B1,B8 |
| ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below |
| ADD B8,A9,A9 |
| || STW A8,*ARG0[6] |
| STW A9,*ARG0[7] |
| .endif |
| .endasmfunc |