crypto/bn/asm/ppc64-mont-fixed.pl - third_party/openssl - Git at Google

 #! /usr/bin/env perl
 # Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html

 # ====================================================================
 # Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
 # <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
 # the OpenSSL project.
 # ====================================================================

 #
 # Fixed length (n=6), unrolled PPC Montgomery Multiplication
 #

 # 2021
 #
 # Although this is a generic implementation for unrolling Montgomery
 # Multiplication for arbitrary values of n, this is currently only
 # used for n = 6 to improve the performance of ECC p384.
 #
 # Unrolling allows intermediate results to be stored in registers,
 # rather than on the stack, improving performance by ~7% compared to
 # the existing PPC assembly code.
 #
 # The ISA 3.0 implementation uses combination multiply/add
 # instructions (maddld, maddhdu) to improve performance by an
 # additional ~10% on Power 9.
 #
 # Finally, saving non-volatile registers into volatile vector
 # registers instead of onto the stack saves a little more.
 #
 # On a Power 9 machine we see an overall improvement of ~18%.
 #

 use strict;
 use warnings;

 my ($flavour, $output, $dir, $xlate);

 # $output is the last argument if it looks like a file (it has an extension)
 # $flavour is the first argument if it doesn't look like a file
 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 die "can't locate ppc-xlate.pl";

 open STDOUT,"| $^X $xlate $flavour \"$output\""
     or die "can't call $xlate: $!";

 if ($flavour !~ /64/) {
 	die "bad flavour ($flavour) - only ppc64 permitted";
 }

 my $SIZE_T= 8;

 # Registers are global so the code is remotely readable

 # Parameters for Montgomery multiplication
 my $sp	= "r1";
 my $toc	= "r2";
 my $rp	= "r3";
 my $ap	= "r4";
 my $bp	= "r5";
 my $np	= "r6";
 my $n0	= "r7";
 my $num	= "r8";

 my $i	= "r9";
 my $c0	= "r10";
 my $bp0	= "r11";
 my $bpi	= "r11";
 my $bpj	= "r11";
 my $tj	= "r12";
 my $apj	= "r12";
 my $npj	= "r12";
 my $lo	= "r14";
 my $c1	= "r14";

 # Non-volatile registers used for tp[i]
 #
 # 12 registers are available but the limit on unrolling is 10,
 # since registers from $tp[0] to $tp[$n+1] are used.
 my @tp = ("r20" .. "r31");

 # volatile VSRs for saving non-volatile GPRs - faster than stack
 my @vsrs = ("v32" .. "v46");

 package Mont;

 sub new($$)
 {
 	my ($class, $n) = @_;

 	if ($n > 10) {
 		die "Can't unroll for BN length ${n} (maximum 10)"
 	}

 	my $self = {
 		code => "",
 		n => $n,
 	};
 	bless $self, $class;

 	return $self;
 }

 sub add_code($$)
 {
 	my ($self, $c) = @_;

 	$self->{code} .= $c;
 }

 sub get_code($)
 {
 	my ($self) = @_;

 	return $self->{code};
 }

 sub get_function_name($)
 {
 	my ($self) = @_;

 	return "bn_mul_mont_fixed_n" . $self->{n};
 }

 sub get_label($$)
 {
 	my ($self, $l) = @_;

 	return "L" . $l . "_" . $self->{n};
 }

 sub get_labels($@)
 {
 	my ($self, @labels) = @_;

 	my %out = ();

 	foreach my $l (@labels) {
 		$out{"$l"} = $self->get_label("$l");
 	}

 	return \%out;
 }

 sub nl($)
 {
 	my ($self) = @_;

 	$self->add_code("\n");
 }

 sub copy_result($)
 {
 	my ($self) = @_;

 	my ($n) = $self->{n};

 	for (my $j = 0; $j < $n; $j++) {
 		$self->add_code(<<___);
 	std		$tp[$j],`$j*$SIZE_T`($rp)
 ___
 	}

 }

 sub mul_mont_fixed($)
 {
 	my ($self) = @_;

 	my ($n) = $self->{n};
 	my $fname = $self->get_function_name();
 	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");

 	$self->add_code(<<___);

 .globl	.${fname}
 .align	5
 .${fname}:

 ___

 	$self->save_registers();

 	$self->add_code(<<___);
 	ld		$n0,0($n0)

 	ld		$bp0,0($bp)

 	ld		$apj,0($ap)
 ___

 	$self->mul_c_0($tp[0], $apj, $bp0, $c0);

 	for (my $j = 1; $j < $n - 1; $j++) {
 		$self->add_code(<<___);
 	ld		$apj,`$j*$SIZE_T`($ap)
 ___
 		$self->mul($tp[$j], $apj, $bp0, $c0);
 	}

 	$self->add_code(<<___);
 	ld		$apj,`($n-1)*$SIZE_T`($ap)
 ___

 	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);

 	$self->add_code(<<___);
 	li		$tp[$n+1],0

 ___

 	$self->add_code(<<___);
 	li		$i,0
 	mtctr		$num
 	b		$label->{"enter"}

 .align	4
 $label->{"outer"}:
 	ldx		$bpi,$bp,$i

 	ld		$apj,0($ap)
 ___

 	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);

 	for (my $j = 1; $j < $n; $j++) {
 		$self->add_code(<<___);
 	ld		$apj,`$j*$SIZE_T`($ap)
 ___
 		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
 	}

 	$self->add_code(<<___);
 	addc		$tp[$n],$tp[$n],$c0
 	addze		$tp[$n+1],$tp[$n+1]
 ___

 	$self->add_code(<<___);
 .align	4
 $label->{"enter"}:
 	mulld		$bpi,$tp[0],$n0

 	ld		$npj,0($np)
 ___

 	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);

 	for (my $j = 1; $j < $n; $j++) {
 		$self->add_code(<<___);
 	ld		$npj,`$j*$SIZE_T`($np)
 ___
 		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
 	}

 	$self->add_code(<<___);
 	addc		$tp[$n-1],$tp[$n],$c0
 	addze		$tp[$n],$tp[$n+1]

 	addi		$i,$i,$SIZE_T
 	bdnz		$label->{"outer"}

 	and.		$tp[$n],$tp[$n],$tp[$n]
 	bne		$label->{"sub"}

 	cmpld	$tp[$n-1],$npj
 	blt		$label->{"copy"}

 $label->{"sub"}:
 ___

 	#
 	# Reduction
 	#

 		$self->add_code(<<___);
 	ld		$bpj,`0*$SIZE_T`($np)
 	subfc		$c1,$bpj,$tp[0]
 	std		$c1,`0*$SIZE_T`($rp)

 ___
 	for (my $j = 1; $j < $n - 1; $j++) {
 		$self->add_code(<<___);
 	ld		$bpj,`$j*$SIZE_T`($np)
 	subfe		$c1,$bpj,$tp[$j]
 	std		$c1,`$j*$SIZE_T`($rp)

 ___
 	}

 		$self->add_code(<<___);
 	subfe		$c1,$npj,$tp[$n-1]
 	std		$c1,`($n-1)*$SIZE_T`($rp)

 ___

 	$self->add_code(<<___);
 	addme.		$tp[$n],$tp[$n]
 	beq		$label->{"end"}

 $label->{"copy"}:
 ___

 	$self->copy_result();

 	$self->add_code(<<___);

 $label->{"end"}:
 ___

 	$self->restore_registers();

 	$self->add_code(<<___);
 	li		r3,1
 	blr
 .size .${fname},.-.${fname}
 ___

 }

 package Mont::GPR;

 our @ISA = ('Mont');

 sub new($$)
 {
     my ($class, $n) = @_;

     return $class->SUPER::new($n);
 }

 sub save_registers($)
 {
 	my ($self) = @_;

 	my $n = $self->{n};

 	$self->add_code(<<___);
 	std	$lo,-8($sp)
 ___

 	for (my $j = 0; $j <= $n+1; $j++) {
 		$self->{code}.=<<___;
 	std	$tp[$j],-`($j+2)*8`($sp)
 ___
 	}

 	$self->add_code(<<___);

 ___
 }

 sub restore_registers($)
 {
 	my ($self) = @_;

 	my $n = $self->{n};

 	$self->add_code(<<___);
 	ld	$lo,-8($sp)
 ___

 	for (my $j = 0; $j <= $n+1; $j++) {
 		$self->{code}.=<<___;
 	ld	$tp[$j],-`($j+2)*8`($sp)
 ___
 	}

 	$self->{code} .=<<___;

 ___
 }

 # Direct translation of C mul()
 sub mul($$$$$)
 {
 	my ($self, $r, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld		$lo,$a,$w
 	addc		$r,$lo,$c
 	mulhdu		$c,$a,$w
 	addze		$c,$c

 ___
 }

 # Like mul() but $c is ignored as an input - an optimisation to save a
 # preliminary instruction that would set input $c to 0
 sub mul_c_0($$$$$)
 {
 	my ($self, $r, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld		$r,$a,$w
 	mulhdu		$c,$a,$w

 ___
 }

 # Like mul() but does not to the final addition of CA into $c - an
 # optimisation to save an instruction
 sub mul_last($$$$$$)
 {
 	my ($self, $r1, $r2, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld		$lo,$a,$w
 	addc		$r1,$lo,$c
 	mulhdu		$c,$a,$w

 	addze		$r2,$c
 ___
 }

 # Like C mul_add() but allow $r_out and $r_in to be different
 sub mul_add($$$$$$)
 {
 	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld		$lo,$a,$w
 	addc		$lo,$lo,$c
 	mulhdu		$c,$a,$w
 	addze		$c,$c
 	addc		$r_out,$r_in,$lo
 	addze		$c,$c

 ___
 }

 # Like mul_add() but $c is ignored as an input - an optimisation to save a
 # preliminary instruction that would set input $c to 0
 sub mul_add_c_0($$$$$$)
 {
 	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld		$lo,$a,$w
 	addc		$r_out,$r_in,$lo
 	mulhdu		$c,$a,$w
 	addze		$c,$c

 ___
 }

 package Mont::GPR_300;

 our @ISA = ('Mont::GPR');

 sub new($$)
 {
 	my ($class, $n) = @_;

 	my $mont = $class->SUPER::new($n);

 	return $mont;
 }

 sub get_function_name($)
 {
 	my ($self) = @_;

 	return "bn_mul_mont_300_fixed_n" . $self->{n};
 }

 sub get_label($$)
 {
 	my ($self, $l) = @_;

 	return "L" . $l . "_300_" . $self->{n};
 }

 # Direct translation of C mul()
 sub mul($$$$$)
 {
 	my ($self, $r, $a, $w, $c, $last) = @_;

 	$self->add_code(<<___);
 	maddld		$r,$a,$w,$c
 	maddhdu		$c,$a,$w,$c

 ___
 }

 # Save the last carry as the final entry
 sub mul_last($$$$$)
 {
 	my ($self, $r1, $r2, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	maddld		$r1,$a,$w,$c
 	maddhdu		$r2,$a,$w,$c

 ___
 }

 # Like mul() but $c is ignored as an input - an optimisation to save a
 # preliminary instruction that would set input $c to 0
 sub mul_c_0($$$$$)
 {
 	my ($self, $r, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	mulld          $r,$a,$w
 	mulhdu          $c,$a,$w

 ___
 }

 # Like C mul_add() but allow $r_out and $r_in to be different
 sub mul_add($$$$$$)
 {
 	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	maddld		$lo,$a,$w,$c
 	maddhdu		$c,$a,$w,$c
 	addc		$r_out,$r_in,$lo
 	addze		$c,$c

 ___
 }

 # Like mul_add() but $c is ignored as an input - an optimisation to save a
 # preliminary instruction that would set input $c to 0
 sub mul_add_c_0($$$$$$)
 {
 	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

 	$self->add_code(<<___);
 	maddld		$lo,$a,$w,$r_in
 	maddhdu		$c,$a,$w,$r_in
 ___

 	if ($r_out ne $lo) {
 		$self->add_code(<<___);
 	mr			$r_out,$lo
 ___
 	}

 	$self->nl();
 }


 package main;

 my $code;

 $code.=<<___;
 .machine "any"
 .text
 ___

 my $mont;

 $mont = new Mont::GPR(6);
 $mont->mul_mont_fixed();
 $code .= $mont->get_code();

 $mont = new Mont::GPR_300(6);
 $mont->mul_mont_fixed();
 $code .= $mont->get_code();

 $code =~ s/\`([^\`]*)\`/eval $1/gem;

 $code.=<<___;
 .asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
 ___

 print $code;
 close STDOUT or die "error closing STDOUT: $!";
	#! /usr/bin/env perl
	# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
	#
	# Licensed under the Apache License 2.0 (the "License"). You may not use
	# this file except in compliance with the License. You can obtain a copy
	# in the file LICENSE in the source distribution or at
	# https://www.openssl.org/source/license.html

	# ====================================================================
	# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
	# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
	# the OpenSSL project.
	# ====================================================================

	#
	# Fixed length (n=6), unrolled PPC Montgomery Multiplication
	#

	# 2021
	#
	# Although this is a generic implementation for unrolling Montgomery
	# Multiplication for arbitrary values of n, this is currently only
	# used for n = 6 to improve the performance of ECC p384.
	#
	# Unrolling allows intermediate results to be stored in registers,
	# rather than on the stack, improving performance by ~7% compared to
	# the existing PPC assembly code.
	#
	# The ISA 3.0 implementation uses combination multiply/add
	# instructions (maddld, maddhdu) to improve performance by an
	# additional ~10% on Power 9.
	#
	# Finally, saving non-volatile registers into volatile vector
	# registers instead of onto the stack saves a little more.
	#
	# On a Power 9 machine we see an overall improvement of ~18%.
	#

	use strict;
	use warnings;

	my ($flavour, $output, $dir, $xlate);

	# $output is the last argument if it looks like a file (it has an extension)
	# $flavour is the first argument if it doesn't look like a file
	$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m\|\.\w+$\| ? pop : undef;
	$flavour = $#ARGV >= 0 && $ARGV[0] !~ m\|\.\| ? shift : undef;

	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
	die "can't locate ppc-xlate.pl";

	open STDOUT,"\| $^X $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";

	if ($flavour !~ /64/) {
	die "bad flavour ($flavour) - only ppc64 permitted";
	}

	my $SIZE_T= 8;

	# Registers are global so the code is remotely readable

	# Parameters for Montgomery multiplication
	my $sp = "r1";
	my $toc = "r2";
	my $rp = "r3";
	my $ap = "r4";
	my $bp = "r5";
	my $np = "r6";
	my $n0 = "r7";
	my $num = "r8";

	my $i = "r9";
	my $c0 = "r10";
	my $bp0 = "r11";
	my $bpi = "r11";
	my $bpj = "r11";
	my $tj = "r12";
	my $apj = "r12";
	my $npj = "r12";
	my $lo = "r14";
	my $c1 = "r14";

	# Non-volatile registers used for tp[i]
	#
	# 12 registers are available but the limit on unrolling is 10,
	# since registers from $tp[0] to $tp[$n+1] are used.
	my @tp = ("r20" .. "r31");

	# volatile VSRs for saving non-volatile GPRs - faster than stack
	my @vsrs = ("v32" .. "v46");

	package Mont;

	sub new($$)
	{
	my ($class, $n) = @_;

	if ($n > 10) {
	die "Can't unroll for BN length ${n} (maximum 10)"
	}

	my $self = {
	code => "",
	n => $n,
	};
	bless $self, $class;

	return $self;
	}

	sub add_code($$)
	{
	my ($self, $c) = @_;

	$self->{code} .= $c;
	}

	sub get_code($)
	{
	my ($self) = @_;

	return $self->{code};
	}

	sub get_function_name($)
	{
	my ($self) = @_;

	return "bn_mul_mont_fixed_n" . $self->{n};
	}

	sub get_label($$)
	{
	my ($self, $l) = @_;

	return "L" . $l . "_" . $self->{n};
	}

	sub get_labels($@)
	{
	my ($self, @labels) = @_;

	my %out = ();

	foreach my $l (@labels) {
	$out{"$l"} = $self->get_label("$l");
	}

	return \%out;
	}

	sub nl($)
	{
	my ($self) = @_;

	$self->add_code("\n");
	}

	sub copy_result($)
	{
	my ($self) = @_;

	my ($n) = $self->{n};

	for (my $j = 0; $j < $n; $j++) {
	$self->add_code(<<___);
	std $tp[$j],`$j*$SIZE_T`($rp)
	___
	}

	}

	sub mul_mont_fixed($)
	{
	my ($self) = @_;

	my ($n) = $self->{n};
	my $fname = $self->get_function_name();
	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");

	$self->add_code(<<___);

	.globl .${fname}
	.align 5
	.${fname}:

	___

	$self->save_registers();

	$self->add_code(<<___);
	ld $n0,0($n0)

	ld $bp0,0($bp)

	ld $apj,0($ap)
	___

	$self->mul_c_0($tp[0], $apj, $bp0, $c0);

	for (my $j = 1; $j < $n - 1; $j++) {
	$self->add_code(<<___);
	ld $apj,`$j*$SIZE_T`($ap)
	___
	$self->mul($tp[$j], $apj, $bp0, $c0);
	}

	$self->add_code(<<___);
	ld $apj,`($n-1)*$SIZE_T`($ap)
	___

	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);

	$self->add_code(<<___);
	li $tp[$n+1],0

	___

	$self->add_code(<<___);
	li $i,0
	mtctr $num
	b $label->{"enter"}

	.align 4
	$label->{"outer"}:
	ldx $bpi,$bp,$i

	ld $apj,0($ap)
	___

	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);

	for (my $j = 1; $j < $n; $j++) {
	$self->add_code(<<___);
	ld $apj,`$j*$SIZE_T`($ap)
	___
	$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
	}

	$self->add_code(<<___);
	addc $tp[$n],$tp[$n],$c0
	addze $tp[$n+1],$tp[$n+1]
	___

	$self->add_code(<<___);
	.align 4
	$label->{"enter"}:
	mulld $bpi,$tp[0],$n0

	ld $npj,0($np)
	___

	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);

	for (my $j = 1; $j < $n; $j++) {
	$self->add_code(<<___);
	ld $npj,`$j*$SIZE_T`($np)
	___
	$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
	}

	$self->add_code(<<___);
	addc $tp[$n-1],$tp[$n],$c0
	addze $tp[$n],$tp[$n+1]

	addi $i,$i,$SIZE_T
	bdnz $label->{"outer"}

	and. $tp[$n],$tp[$n],$tp[$n]
	bne $label->{"sub"}

	cmpld $tp[$n-1],$npj
	blt $label->{"copy"}

	$label->{"sub"}:
	___

	#
	# Reduction
	#

	$self->add_code(<<___);
	ld $bpj,`0*$SIZE_T`($np)
	subfc $c1,$bpj,$tp[0]
	std $c1,`0*$SIZE_T`($rp)

	___
	for (my $j = 1; $j < $n - 1; $j++) {
	$self->add_code(<<___);
	ld $bpj,`$j*$SIZE_T`($np)
	subfe $c1,$bpj,$tp[$j]
	std $c1,`$j*$SIZE_T`($rp)

	___
	}

	$self->add_code(<<___);
	subfe $c1,$npj,$tp[$n-1]
	std $c1,`($n-1)*$SIZE_T`($rp)

	___

	$self->add_code(<<___);
	addme. $tp[$n],$tp[$n]
	beq $label->{"end"}

	$label->{"copy"}:
	___

	$self->copy_result();

	$self->add_code(<<___);

	$label->{"end"}:
	___

	$self->restore_registers();

	$self->add_code(<<___);
	li r3,1
	blr
	.size .${fname},.-.${fname}
	___

	}

	package Mont::GPR;

	our @ISA = ('Mont');

	sub new($$)
	{
	my ($class, $n) = @_;

	return $class->SUPER::new($n);
	}

	sub save_registers($)
	{
	my ($self) = @_;

	my $n = $self->{n};

	$self->add_code(<<___);
	std $lo,-8($sp)
	___

	for (my $j = 0; $j <= $n+1; $j++) {
	$self->{code}.=<<___;
	std $tp[$j],-`($j+2)*8`($sp)
	___
	}

	$self->add_code(<<___);

	___
	}

	sub restore_registers($)
	{
	my ($self) = @_;

	my $n = $self->{n};

	$self->add_code(<<___);
	ld $lo,-8($sp)
	___

	for (my $j = 0; $j <= $n+1; $j++) {
	$self->{code}.=<<___;
	ld $tp[$j],-`($j+2)*8`($sp)
	___
	}

	$self->{code} .=<<___;

	___
	}

	# Direct translation of C mul()
	sub mul($$$$$)
	{
	my ($self, $r, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $lo,$a,$w
	addc $r,$lo,$c
	mulhdu $c,$a,$w
	addze $c,$c

	___
	}

	# Like mul() but $c is ignored as an input - an optimisation to save a
	# preliminary instruction that would set input $c to 0
	sub mul_c_0($$$$$)
	{
	my ($self, $r, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $r,$a,$w
	mulhdu $c,$a,$w

	___
	}

	# Like mul() but does not to the final addition of CA into $c - an
	# optimisation to save an instruction
	sub mul_last($$$$$$)
	{
	my ($self, $r1, $r2, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $lo,$a,$w
	addc $r1,$lo,$c
	mulhdu $c,$a,$w

	addze $r2,$c
	___
	}

	# Like C mul_add() but allow $r_out and $r_in to be different
	sub mul_add($$$$$$)
	{
	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $lo,$a,$w
	addc $lo,$lo,$c
	mulhdu $c,$a,$w
	addze $c,$c
	addc $r_out,$r_in,$lo
	addze $c,$c

	___
	}

	# Like mul_add() but $c is ignored as an input - an optimisation to save a
	# preliminary instruction that would set input $c to 0
	sub mul_add_c_0($$$$$$)
	{
	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $lo,$a,$w
	addc $r_out,$r_in,$lo
	mulhdu $c,$a,$w
	addze $c,$c

	___
	}

	package Mont::GPR_300;

	our @ISA = ('Mont::GPR');

	sub new($$)
	{
	my ($class, $n) = @_;

	my $mont = $class->SUPER::new($n);

	return $mont;
	}

	sub get_function_name($)
	{
	my ($self) = @_;

	return "bn_mul_mont_300_fixed_n" . $self->{n};
	}

	sub get_label($$)
	{
	my ($self, $l) = @_;

	return "L" . $l . "_300_" . $self->{n};
	}

	# Direct translation of C mul()
	sub mul($$$$$)
	{
	my ($self, $r, $a, $w, $c, $last) = @_;

	$self->add_code(<<___);
	maddld $r,$a,$w,$c
	maddhdu $c,$a,$w,$c

	___
	}

	# Save the last carry as the final entry
	sub mul_last($$$$$)
	{
	my ($self, $r1, $r2, $a, $w, $c) = @_;

	$self->add_code(<<___);
	maddld $r1,$a,$w,$c
	maddhdu $r2,$a,$w,$c

	___
	}

	# Like mul() but $c is ignored as an input - an optimisation to save a
	# preliminary instruction that would set input $c to 0
	sub mul_c_0($$$$$)
	{
	my ($self, $r, $a, $w, $c) = @_;

	$self->add_code(<<___);
	mulld $r,$a,$w
	mulhdu $c,$a,$w

	___
	}

	# Like C mul_add() but allow $r_out and $r_in to be different
	sub mul_add($$$$$$)
	{
	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

	$self->add_code(<<___);
	maddld $lo,$a,$w,$c
	maddhdu $c,$a,$w,$c
	addc $r_out,$r_in,$lo
	addze $c,$c

	___
	}

	# Like mul_add() but $c is ignored as an input - an optimisation to save a
	# preliminary instruction that would set input $c to 0
	sub mul_add_c_0($$$$$$)
	{
	my ($self, $r_out, $r_in, $a, $w, $c) = @_;

	$self->add_code(<<___);
	maddld $lo,$a,$w,$r_in
	maddhdu $c,$a,$w,$r_in
	___

	if ($r_out ne $lo) {
	$self->add_code(<<___);
	mr $r_out,$lo
	___
	}

	$self->nl();
	}


	package main;

	my $code;

	$code.=<<___;
	.machine "any"
	.text
	___

	my $mont;

	$mont = new Mont::GPR(6);
	$mont->mul_mont_fixed();
	$code .= $mont->get_code();

	$mont = new Mont::GPR_300(6);
	$mont->mul_mont_fixed();
	$code .= $mont->get_code();

	$code =~ s/\`([^\`]*)\`/eval $1/gem;

	$code.=<<___;
	.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
	___

	print $code;
	close STDOUT or die "error closing STDOUT: $!";