#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. Performance was measured to be ~18 cycles per processed byte
# on z10, which is almost 40% better than gcc-generated code. It should
# be noted that 18 cycles is a worse result than expected: the loop is
# scheduled for 12 and the result should be close to 12. In the absence
# of instruction-level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# results and is therefore engaged. On z196 it was measured to process
# an 8KB buffer ~7 times faster than the software implementation. It's
# not as impressive for smaller buffer sizes, and for the smallest
# 16-byte buffer it's actually almost 2 times slower. This is the
# reason why KIMD-GHASH is not used in gcm_gmult_4bit.

# Command line: <flavour> [...] <output file>.
# The first argument selects the target flavour; the remaining arguments
# are scanned for the first one that looks like a file name ("name.ext"),
# which becomes the output path.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	# 31-/32-bit flavour: 4-byte save slots, plain stm/lm
	$SIZE_T=4;
	$g="";
} else {
	# otherwise: 8-byte save slots, stmg/lmg
	$SIZE_T=8;
	$g="g";
}

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect only when an output file was actually named, and fail loudly
# instead of silently generating nothing when it cannot be created.
!$output or open STDOUT,">$output" or die "can't open $output: $!";

# When non-zero, the hardware (KIMD) code paths guarded by "!$softonly"
# below are left out of the generated assembly.
$softonly=0;

# 128-bit working value: $Zhi is loaded from offset 0 of Xi, $Zlo from
# offset 8.
$Zhi="%r0";
$Zlo="%r1";

$Xi="%r2";	# argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";

$rem0="%r6";	# variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";		# holds the constant 0xf<<3 (mask for rem_4bit offsets)
$rem_4bit="%r14";	# address of the rem_4bit table

$sp="%r15";

# gcm_gmult_4bit: single-block entry point.
#
# Register roles are the ones assigned to $Xi and $Htbl at the top of
# the script.  The software path saves %r6-%r14, biases $Xi by -1, sets
# the block counter $len to 1, loads the low half of Xi into $Zlo and
# jumps to .Lgmult_shortcut inside gcm_ghash_4bit; it therefore also
# stores the result and returns through that function's epilogue.
#
# The last chunk of this group additionally emits the header (.globl,
# .align and label) of gcm_ghash_4bit, whose body is generated next.
$code.=<<___;
.text

.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
___
# KIMD-GHASH variant for a single block.  It is permanently disabled by
# the "&& 0" in the condition and kept for reference only.
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security-assist
	jz	.Lsoft_gmult
	lghi	%r0,0
	lg	%r1,24(%r1)	# load second word of kimd capabilities vector
	tmhh	%r1,0x4000	# check for function 65
	jz	.Lsoft_gmult
	stg	%r0,16($sp)	# arrange 16 bytes of zero input
	stg	%r0,24($sp)
	lghi	%r0,65		# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	la	$inp,16($sp)
	lghi	$len,16
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_gmult:
___
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	j	.Lgmult_shortcut
.type	gcm_gmult_4bit,\@function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
___
# Body of gcm_ghash_4bit (its label is emitted by the preceding chunk)
# followed by the shared rem_4bit table.
#
# Hardware path (omitted when $softonly is set):
#   - test mask 0x4000 (tmhl) of the first doubleword of
#     OPENSSL_s390xcap_P; fall back to software if it is clear;
#   - issue KIMD with function code 0 and a parameter block at
#     16($sp), then test the doubleword at 24($sp) for function 65;
#   - run KIMD function 65 with %r1 = Xi, repeating while the
#     instruction reports partial completion (brc 1).
#
# Software path:
#   - for the 31/32-bit flavours $len is zero-extended first (llgfr);
#   - %r6-%r14 are saved at 6*$SIZE_T($sp) and restored before return;
#   - $Xi is biased by -1, so Xi itself is addressed as 0+1($Xi) and
#     8+1($Xi), and 0($cnt,$Xi) with $cnt counting 14..1 fetches bytes
#     13..0 of Xi;
#   - $len is turned into a count of 16-byte blocks (srlg by 4);
#   - .Louter: XOR the next input block into Xi and store it back;
#   - .Lgmult_shortcut: entry used by gcm_gmult_4bit; starts the
#     multiplication with the two lowest bytes taken from $Zlo;
#   - .Lghash_inner: 14 iterations (brct on $cnt), each loading one more
#     byte of Xi and doing two $Htbl lookups with rem_4bit reduction;
#   - the rem_4bit term for the last nibble is kept in $tmp, shifted
#     left by 4, and folded into $Zhi either at the top of the next
#     .Louter iteration or after the loop.
#
# NOTE(review): the block loop is bottom-tested (brctg), so a length
# below 16 is not guarded against here -- callers presumably always pass
# a non-zero multiple of 16; confirm against the C callers.
$code.=<<___ if(!$softonly);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security-assist
	jz	.Lsoft_ghash
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb93e0004	# kimd %r0,%r4
	lg	%r1,24($sp)
	tmhh	%r1,0x4000	# check for function 65
	jz	.Lsoft_ghash
	lghi	%r0,65		# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_ghash:
___
$code.=<<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	lg	$Zhi,0+1($Xi)
	lghi	$tmp,0
.Louter:
	xg	$Zhi,0($inp)		# Xi ^= inp
	xg	$Zlo,8($inp)
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)

.Lgmult_shortcut:
	lghi	$tmp,0xf0
	sllg	$nlo,$Zlo,4
	srlg	$xi,$Zlo,8		# extract second byte
	ngr	$nlo,$tmp
	lgr	$nhi,$Zlo
	lghi	$cnt,14
	ngr	$nhi,$tmp

	lg	$Zlo,8($nlo,$Htbl)
	lg	$Zhi,0($nlo,$Htbl)

	sllg	$nlo,$xi,4
	sllg	$rem0,$Zlo,3
	ngr	$nlo,$tmp
	ngr	$rem0,$x78
	ngr	$xi,$tmp

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	j	.Lghash_inner
.align	16
.Lghash_inner:
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	llgc	$xi,0($cnt,$Xi)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$nlo,$xi,4
	xg	$Zhi,0($rem0,$rem_4bit)
	nill	$nlo,0xf0
	sllg	$rem0,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem0,$x78
	nill	$xi,0xf0

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	xg	$Zhi,0($rem1,$rem_4bit)
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	brct	$cnt,.Lghash_inner

	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$xi,$Zlo,3
	xg	$Zhi,0($rem0,$rem_4bit)
	xgr	$Zlo,$tmp
	ngr	$xi,$x78

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	xgr	$Zlo,$tmp
	xg	$Zhi,0($rem1,$rem_4bit)

	lg	$tmp,0($xi,$rem_4bit)
	la	$inp,16($inp)
	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
	brctg	$len,.Louter

	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align	64
rem_4bit:
	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type	rem_4bit,\@object
.size	rem_4bit,(.-rem_4bit)
.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the template with the value of the Perl
# expression it contains (e.g. 0xf<<3), then write the result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors (such as a full disk) only surface when the
# handle is closed, so a failed close must abort the build.
close STDOUT or die "error closing STDOUT: $!";