/* simd/jdsample-altivec.c - third_party/libjpeg-turbo - Git at Google */

 /*
  * AltiVec optimizations for libjpeg-turbo
  *
  * Copyright (C) 2015, D. R. Commander.
  * All rights reserved.
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  */

 /* CHROMA UPSAMPLING */

 #include "jsimd_altivec.h"


 /*
  * Fancy 2:1 horizontal upsampling, 16 input samples per loop iteration.
  *
  * Every input sample yields two output samples:
  *   even output = (3 * this + previous + 1) >> 2
  *   odd output  = (3 * this + next + 2) >> 2
  * The first sample of a row acts as its own "previous" neighbor and the last
  * sample as its own "next" neighbor.
  *
  * max_v_samp_factor  number of rows to process
  * downsampled_width  number of input samples per row
  * input_data         input rows
  * output_data_ptr    points to the array of output rows
  *
  * NOTE(review): when downsampled_width is not a multiple of 16, one byte is
  * written just past the end of each input row, and whole 16-byte vectors are
  * loaded and stored; this presumably relies on padded, 16-byte-aligned row
  * buffers -- confirm against the caller.
  */
 void
 jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr;
   int inrow, incol;

   /* this0/last0/next0 hold the current, previous and following 16-sample
    * blocks; p_last0/p_next0 hold the current block shifted by one sample so
    * that every lane lines up with its left/right neighbor.
    */
   __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
     out;
   /* Suffix "l" = first eight samples of the block, "h" = last eight;
    * "e"/"o" in the out* names = even/odd output samples.
    */
   __vector short this0e, this0o, this0l, this0h, last0l, last0h,
     next0l, next0h, outle, outhe, outlo, outho;

   /* Constants
    *
    * last_index_col0, last_index, next_index and next_index_lastcol are
    * vec_perm() byte selectors that shift a block by one sample.  The _col0
    * and _lastcol variants repeat the first or last byte of the vector
    * instead of pulling a byte from the adjacent vector.
    * merge_pack_index interleaves one byte from each 16-bit lane of two
    * vectors (the odd-numbered bytes when __BIG_ENDIAN__ is set, the
    * even-numbered bytes otherwise).
    * NOTE(review): __16X()/__8X() are defined outside this file and presumably
    * repeat their argument to fill the vector; pb_zero is not referenced
    * directly below and is presumably consumed by the VEC_UNPACK*U() macros --
    * confirm in jsimd_altivec.h.
    */
   __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
     last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
     last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
     next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
     next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
 #if __BIG_ENDIAN__
     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
 #else
     merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
 #endif
   __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

   for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
     inptr = input_data[inrow];
     outptr = output_data[inrow];

     /* Width not a multiple of 16: replicate the last sample one position past
      * the end of the row so the final sample sees itself as its right-hand
      * neighbor inside the last (partial) block.
      */
     if (downsampled_width & 15)
       inptr[downsampled_width] = inptr[downsampled_width - 1];

     /* First block of the row: sample 0 is its own left-hand neighbor. */
     this0 = vec_ld(0, inptr);
     p_last0 = vec_perm(this0, this0, last_index_col0);
     last0 = this0;

     /* incol counts the input samples still to be consumed in this row. */
     for (incol = downsampled_width; incol > 0;
          incol -= 16, inptr += 16, outptr += 32) {

       /* Every block after the first: the left-hand neighbor of lane 0 is the
        * last sample of the previous block.
        */
       if (downsampled_width - incol > 0) {
         p_last0 = vec_perm(last0, this0, last_index);
         last0 = this0;
       }

       /* Last block: nothing further to load, so the final lane is its own
        * right-hand neighbor.  Otherwise fetch the following block and take
        * its first sample as the neighbor of lane 15.
        */
       if (incol <= 16)
         p_next0 = vec_perm(this0, this0, next_index_lastcol);
       else {
         next0 = vec_ld(16, inptr);
         p_next0 = vec_perm(this0, next0, next_index);
       }

       /* 3 * this0 as 16-bit products.  Even and odd bytes are multiplied
        * separately, then re-interleaved back into sample order.
        */
       this0e = (__vector short)vec_mule(this0, pb_three);
       this0o = (__vector short)vec_mulo(this0, pb_three);
       this0l = vec_mergeh(this0e, this0o);
       this0h = vec_mergel(this0e, this0o);

       /* NOTE(review): VEC_UNPACKHU()/VEC_UNPACKLU() are defined outside this
        * file; presumably they zero-extend the first/last eight bytes to
        * 16-bit lanes -- confirm.
        */
       last0l = (__vector short)VEC_UNPACKHU(p_last0);
       last0h = (__vector short)VEC_UNPACKLU(p_last0);
       /* +1 is the rounding term of the even outputs. */
       last0l = vec_add(last0l, pw_one);

       next0l = (__vector short)VEC_UNPACKHU(p_next0);
       next0h = (__vector short)VEC_UNPACKLU(p_next0);
       /* +2 is the rounding term of the odd outputs. */
       next0l = vec_add(next0l, pw_two);

       /* First eight input samples -> 16 output samples. */
       outle = vec_add(this0l, last0l);
       outlo = vec_add(this0l, next0l);
       outle = vec_sr(outle, (__vector unsigned short)pw_two);
       outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

       /* Narrow back to bytes while interleaving even and odd outputs. */
       out = vec_perm((__vector unsigned char)outle,
                      (__vector unsigned char)outlo, merge_pack_index);
       vec_st(out, 0, outptr);

       /* The last eight input samples of the block are only processed and
        * stored when more than eight samples remain.
        */
       if (incol > 8) {
         last0h = vec_add(last0h, pw_one);
         next0h = vec_add(next0h, pw_two);

         outhe = vec_add(this0h, last0h);
         outho = vec_add(this0h, next0h);
         outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
         outho = vec_sr(outho, (__vector unsigned short)pw_two);

         out = vec_perm((__vector unsigned char)outhe,
                        (__vector unsigned char)outho, merge_pack_index);
         vec_st(out, 16, outptr);
       }

       /* Slide the window: the block just loaded becomes the current one. */
       this0 = next0;
     }
   }
 }


 /*
  * Fancy 2:1 horizontal and vertical upsampling, 16 input samples per loop
  * iteration.
  *
  * Each input row r produces two output rows.  The upper output row blends
  * row r with row r - 1, the lower output row blends row r with row r + 1:
  *   colsum      = 3 * row[r] + row[r - 1]   (or row[r + 1])
  *   even output = (3 * colsum + previous colsum + 8) >> 4
  *   odd output  = (3 * colsum + next colsum + 7) >> 4
  * The first column sum of a row acts as its own "previous" neighbor and the
  * last one as its own "next" neighbor.
  *
  * max_v_samp_factor  number of output rows to produce (two per iteration)
  * downsampled_width  number of input samples per row
  * input_data         input rows
  * output_data_ptr    points to the array of output rows
  *
  * NOTE(review): the first iteration reads input_data[-1] and every iteration
  * reads input_data[inrow + 1], so the caller presumably provides valid
  * context rows around the ones being upsampled.  When downsampled_width is
  * not a multiple of 16, one byte is written just past the end of all three
  * input rows.  Confirm both against the caller.
  */
 void
 jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
   int inrow, outrow, incol;

   /* Naming: "_1" = row above (inrow - 1), "0" = current row, "1" = row below
    * (inrow + 1); suffix "l" = first eight samples of a block, "h" = last
    * eight; "p_" = vector shifted by one 16-bit lane to line up neighbors.
    */
   __vector unsigned char this_1, this0, this1, out;
   __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
     lastcolsum_1h, lastcolsum1h,
     p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
     thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
     nextcolsum_1l = {0}, nextcolsum_1h = {0},
     nextcolsum1l = {0}, nextcolsum1h = {0},
     p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
     tmpl, tmph, outle, outhe, outlo, outho;

   /* Constants
    *
    * The *_index vectors are vec_perm() byte selectors that shift a vector of
    * 16-bit lanes by one lane (two bytes).  The _col0 and _lastcol variants
    * repeat the first or last lane instead of pulling one from the adjacent
    * vector.  merge_pack_index interleaves one byte from each 16-bit lane of
    * two vectors (the odd-numbered bytes when __BIG_ENDIAN__ is set, the
    * even-numbered bytes otherwise).
    * NOTE(review): __16X()/__8X() are defined outside this file and presumably
    * repeat their argument to fill the vector; pb_zero is not referenced
    * directly below and is presumably consumed by the VEC_UNPACK*U() macros --
    * confirm in jsimd_altivec.h.
    */
   __vector unsigned char pb_zero = { __16X(0) },
     last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
     last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
     next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
     next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
 #if __BIG_ENDIAN__
     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
 #else
     merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
 #endif
   __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
     pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
   __vector unsigned short pw_four = { __8X(4) };

   /* outrow advances by two per iteration (see the two outrow++ below). */
   for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

     inptr_1 = input_data[inrow - 1];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow + 1];
     outptr0 = output_data[outrow++];
     outptr1 = output_data[outrow++];

     /* Width not a multiple of 16: replicate the last sample of each input
      * row one position past its end so the final column sees itself as its
      * right-hand neighbor inside the last (partial) block.
      */
     if (downsampled_width & 15) {
       inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
       inptr0[downsampled_width] = inptr0[downsampled_width - 1];
       inptr1[downsampled_width] = inptr1[downsampled_width - 1];
     }

     /* Prime the first block: 3 * current row, widened to 16 bits.
      * NOTE(review): VEC_UNPACKHU()/VEC_UNPACKLU() are defined outside this
      * file; presumably they zero-extend the first/last eight bytes to 16-bit
      * lanes -- confirm.
      */
     this0 = vec_ld(0, inptr0);
     this0l = (__vector short)VEC_UNPACKHU(this0);
     this0h = (__vector short)VEC_UNPACKLU(this0);
     this0l = vec_mladd(this0l, pw_three, pw_zero);
     this0h = vec_mladd(this0h, pw_three, pw_zero);

     /* Column sums for the upper output row (3 * current + row above).  In
      * the first block, lane 0 is its own left-hand neighbor.
      */
     this_1 = vec_ld(0, inptr_1);
     this_1l = (__vector short)VEC_UNPACKHU(this_1);
     this_1h = (__vector short)VEC_UNPACKLU(this_1);
     thiscolsum_1l = vec_add(this0l, this_1l);
     thiscolsum_1h = vec_add(this0h, this_1h);
     lastcolsum_1h = thiscolsum_1h;
     p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
     p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

     /* Column sums for the lower output row (3 * current + row below). */
     this1 = vec_ld(0, inptr1);
     this1l = (__vector short)VEC_UNPACKHU(this1);
     this1h = (__vector short)VEC_UNPACKLU(this1);
     thiscolsum1l = vec_add(this0l, this1l);
     thiscolsum1h = vec_add(this0h, this1h);
     lastcolsum1h = thiscolsum1h;
     p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
     p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

     /* incol counts the input samples still to be consumed in this row. */
     for (incol = downsampled_width; incol > 0;
          incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
          outptr0 += 32, outptr1 += 32) {

       /* Every block after the first: left-hand neighbors come from the tail
        * of the previous block's column sums.
        */
       if (downsampled_width - incol > 0) {
         p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
         p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
         p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
         p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
         lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
       }

       if (incol <= 16) {
         /* Last block: nothing further to load, so the final lane is its own
          * right-hand neighbor.
          */
         p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
         p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                    next_index_lastcol);
         p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
         p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                   next_index_lastcol);
       } else {
         /* Load the following block of all three rows and build its column
          * sums; its first lane is the right-hand neighbor of this block's
          * last lane.  this0l/this0h are reused for both rows' sums.
          */
         this0 = vec_ld(16, inptr0);
         this0l = (__vector short)VEC_UNPACKHU(this0);
         this0h = (__vector short)VEC_UNPACKLU(this0);
         this0l = vec_mladd(this0l, pw_three, pw_zero);
         this0h = vec_mladd(this0h, pw_three, pw_zero);

         this_1 = vec_ld(16, inptr_1);
         this_1l = (__vector short)VEC_UNPACKHU(this_1);
         this_1h = (__vector short)VEC_UNPACKLU(this_1);
         nextcolsum_1l = vec_add(this0l, this_1l);
         nextcolsum_1h = vec_add(this0h, this_1h);
         p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
         p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

         this1 = vec_ld(16, inptr1);
         this1l = (__vector short)VEC_UNPACKHU(this1);
         this1h = (__vector short)VEC_UNPACKLU(this1);
         nextcolsum1l = vec_add(this0l, this1l);
         nextcolsum1h = vec_add(this0h, this1h);
         p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
         p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
       }

       /* Process the upper row */

       /* Even outputs round with +8, odd outputs with +7, before >> 4. */
       tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
       outle = vec_add(tmpl, p_lastcolsum_1l);
       outle = vec_add(outle, pw_eight);
       outle = vec_sr(outle, pw_four);

       outlo = vec_add(tmpl, p_nextcolsum_1l);
       outlo = vec_add(outlo, pw_seven);
       outlo = vec_sr(outlo, pw_four);

       /* Narrow back to bytes while interleaving even and odd outputs. */
       out = vec_perm((__vector unsigned char)outle,
                      (__vector unsigned char)outlo, merge_pack_index);
       vec_st(out, 0, outptr0);

       /* The last eight columns of the block are only processed and stored
        * when more than eight input samples remain.
        */
       if (incol > 8) {
         tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
         outhe = vec_add(tmph, p_lastcolsum_1h);
         outhe = vec_add(outhe, pw_eight);
         outhe = vec_sr(outhe, pw_four);

         outho = vec_add(tmph, p_nextcolsum_1h);
         outho = vec_add(outho, pw_seven);
         outho = vec_sr(outho, pw_four);

         out = vec_perm((__vector unsigned char)outhe,
                        (__vector unsigned char)outho, merge_pack_index);
         vec_st(out, 16, outptr0);
       }

       /* Process the lower row */

       tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
       outle = vec_add(tmpl, p_lastcolsum1l);
       outle = vec_add(outle, pw_eight);
       outle = vec_sr(outle, pw_four);

       outlo = vec_add(tmpl, p_nextcolsum1l);
       outlo = vec_add(outlo, pw_seven);
       outlo = vec_sr(outlo, pw_four);

       out = vec_perm((__vector unsigned char)outle,
                      (__vector unsigned char)outlo, merge_pack_index);
       vec_st(out, 0, outptr1);

       if (incol > 8) {
         tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
         outhe = vec_add(tmph, p_lastcolsum1h);
         outhe = vec_add(outhe, pw_eight);
         outhe = vec_sr(outhe, pw_four);

         outho = vec_add(tmph, p_nextcolsum1h);
         outho = vec_add(outho, pw_seven);
         outho = vec_sr(outho, pw_four);

         out = vec_perm((__vector unsigned char)outhe,
                        (__vector unsigned char)outho, merge_pack_index);
         vec_st(out, 16, outptr1);
       }

       /* Slide the window: the sums just built become the current ones. */
       thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
       thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
     }
   }
 }


 /* These are rarely used (mainly just for decompressing YCCK images) */

 /*
  * Plain 2:1 horizontal upsampling: every input byte is emitted twice in
  * succession.
  *
  * max_v_samp_factor  number of rows to process
  * output_width       number of output samples per row
  * input_data         input rows
  * output_data_ptr    points to the array of output rows
  *
  * NOTE(review): the per-row byte count is rounded up to a multiple of 32 and
  * whole 16-byte vectors are loaded and stored, so this presumably relies on
  * padded, 16-byte-aligned row buffers -- confirm against the caller.
  */
 void
 jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
                              JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY dst_rows = *output_data_ptr;
   JSAMPROW src, dst;
   int row, remaining;
   __vector unsigned char samples, dup_first, dup_second;

   row = 0;
   while (row < max_v_samp_factor) {
     src = input_data[row];
     dst = dst_rows[row];

     /* Output bytes still to produce, rounded up to a multiple of 32. */
     remaining = (output_width + 31) & (~31);

     while (remaining > 0) {
       /* Merging a vector with itself doubles each byte: vec_mergeh() covers
        * the first eight bytes, vec_mergel() the last eight.
        */
       samples = vec_ld(0, src);
       dup_first = vec_mergeh(samples, samples);
       dup_second = vec_mergel(samples, samples);

       vec_st(dup_first, 0, dst);
       vec_st(dup_second, 16, dst);

       /* Handle a second 16-byte input vector only if more than 32 output
        * bytes are still outstanding.
        */
       if (remaining > 32) {
         samples = vec_ld(16, src);
         dup_first = vec_mergeh(samples, samples);
         dup_second = vec_mergel(samples, samples);

         vec_st(dup_first, 32, dst);
         vec_st(dup_second, 48, dst);
       }

       remaining -= 64;
       src += 32;
       dst += 64;
     }

     row++;
   }
 }


 /*
  * Plain 2:1 horizontal and vertical upsampling: every input byte is emitted
  * twice in succession, and each doubled row is written to two consecutive
  * output rows.
  *
  * max_v_samp_factor  number of output rows to produce (two per input row)
  * output_width       number of output samples per row
  * input_data         input rows
  * output_data_ptr    points to the array of output rows
  *
  * NOTE(review): the per-row byte count is rounded up to a multiple of 32 and
  * whole 16-byte vectors are loaded and stored, so this presumably relies on
  * padded, 16-byte-aligned row buffers -- confirm against the caller.
  */
 void
 jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
                              JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY dst_rows = *output_data_ptr;
   JSAMPROW src, dst_upper, dst_lower;
   int src_row, dst_row, remaining;
   __vector unsigned char samples, dup_first, dup_second;

   src_row = 0;
   dst_row = 0;
   while (dst_row < max_v_samp_factor) {
     src = input_data[src_row];
     dst_upper = dst_rows[dst_row];
     dst_lower = dst_rows[dst_row + 1];
     dst_row += 2;

     /* Output bytes still to produce per row, rounded up to a multiple of
      * 32.
      */
     remaining = (output_width + 31) & (~31);

     while (remaining > 0) {
       /* Merging a vector with itself doubles each byte: vec_mergeh() covers
        * the first eight bytes, vec_mergel() the last eight.
        */
       samples = vec_ld(0, src);
       dup_first = vec_mergeh(samples, samples);
       dup_second = vec_mergel(samples, samples);

       vec_st(dup_first, 0, dst_upper);
       vec_st(dup_first, 0, dst_lower);

       vec_st(dup_second, 16, dst_upper);
       vec_st(dup_second, 16, dst_lower);

       /* Handle a second 16-byte input vector only if more than 32 output
        * bytes are still outstanding.
        */
       if (remaining > 32) {
         samples = vec_ld(16, src);
         dup_first = vec_mergeh(samples, samples);
         dup_second = vec_mergel(samples, samples);

         vec_st(dup_first, 32, dst_upper);
         vec_st(dup_first, 32, dst_lower);

         vec_st(dup_second, 48, dst_upper);
         vec_st(dup_second, 48, dst_lower);
       }

       remaining -= 64;
       src += 32;
       dst_upper += 64;
       dst_lower += 64;
     }

     src_row++;
   }
 }
	/*
	* AltiVec optimizations for libjpeg-turbo
	*
	* Copyright (C) 2015, D. R. Commander.
	* All rights reserved.
	* This software is provided 'as-is', without any express or implied
	* warranty. In no event will the authors be held liable for any damages
	* arising from the use of this software.
	*
	* Permission is granted to anyone to use this software for any purpose,
	* including commercial applications, and to alter it and redistribute it
	* freely, subject to the following restrictions:
	*
	* 1. The origin of this software must not be misrepresented; you must not
	* claim that you wrote the original software. If you use this software
	* in a product, an acknowledgment in the product documentation would be
	* appreciated but is not required.
	* 2. Altered source versions must be plainly marked as such, and must not be
	* misrepresented as being the original software.
	* 3. This notice may not be removed or altered from any source distribution.
	*/

	/* CHROMA UPSAMPLING */

	#include "jsimd_altivec.h"


	/*
	* Fancy 2:1 horizontal upsampling, 16 input samples per loop iteration.
	*
	* Every input sample yields two output samples:
	*   even output = (3 * this + previous + 1) >> 2
	*   odd output  = (3 * this + next + 2) >> 2
	* The first sample of a row acts as its own "previous" neighbor and the last
	* sample as its own "next" neighbor.
	*
	* max_v_samp_factor  number of rows to process
	* downsampled_width  number of input samples per row
	* input_data         input rows
	* output_data_ptr    points to the array of output rows
	*
	* NOTE(review): when downsampled_width is not a multiple of 16, one byte is
	* written just past the end of each input row, and whole 16-byte vectors are
	* loaded and stored; this presumably relies on padded, 16-byte-aligned row
	* buffers -- confirm against the caller.
	*/
	void
	jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
	JDIMENSION downsampled_width,
	JSAMPARRAY input_data,
	JSAMPARRAY *output_data_ptr)
	{
	JSAMPARRAY output_data = *output_data_ptr;
	JSAMPROW inptr, outptr;
	int inrow, incol;

	/* this0/last0/next0 hold the current, previous and following 16-sample
	* blocks; p_last0/p_next0 hold the current block shifted by one sample so
	* that every lane lines up with its left/right neighbor.
	*/
	__vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
	out;
	/* Suffix "l" = first eight samples of the block, "h" = last eight;
	* "e"/"o" in the out* names = even/odd output samples.
	*/
	__vector short this0e, this0o, this0l, this0h, last0l, last0h,
	next0l, next0h, outle, outhe, outlo, outho;

	/* Constants
	*
	* last_index_col0, last_index, next_index and next_index_lastcol are
	* vec_perm() byte selectors that shift a block by one sample.  The _col0
	* and _lastcol variants repeat the first or last byte of the vector
	* instead of pulling a byte from the adjacent vector.
	* merge_pack_index interleaves one byte from each 16-bit lane of two
	* vectors (the odd-numbered bytes when __BIG_ENDIAN__ is set, the
	* even-numbered bytes otherwise).
	* NOTE(review): __16X()/__8X() are defined outside this file and presumably
	* repeat their argument to fill the vector; pb_zero is not referenced
	* directly below and is presumably consumed by the VEC_UNPACK*U() macros --
	* confirm in jsimd_altivec.h.
	*/
	__vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
	last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
	last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
	next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
	next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
	#if __BIG_ENDIAN__
	merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
	#else
	merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
	#endif
	__vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

	for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
	inptr = input_data[inrow];
	outptr = output_data[inrow];

	/* Width not a multiple of 16: replicate the last sample one position past
	* the end of the row so the final sample sees itself as its right-hand
	* neighbor inside the last (partial) block.
	*/
	if (downsampled_width & 15)
	inptr[downsampled_width] = inptr[downsampled_width - 1];

	/* First block of the row: sample 0 is its own left-hand neighbor. */
	this0 = vec_ld(0, inptr);
	p_last0 = vec_perm(this0, this0, last_index_col0);
	last0 = this0;

	/* incol counts the input samples still to be consumed in this row. */
	for (incol = downsampled_width; incol > 0;
	incol -= 16, inptr += 16, outptr += 32) {

	/* Every block after the first: the left-hand neighbor of lane 0 is the
	* last sample of the previous block.
	*/
	if (downsampled_width - incol > 0) {
	p_last0 = vec_perm(last0, this0, last_index);
	last0 = this0;
	}

	/* Last block: nothing further to load, so the final lane is its own
	* right-hand neighbor.  Otherwise fetch the following block and take
	* its first sample as the neighbor of lane 15.
	*/
	if (incol <= 16)
	p_next0 = vec_perm(this0, this0, next_index_lastcol);
	else {
	next0 = vec_ld(16, inptr);
	p_next0 = vec_perm(this0, next0, next_index);
	}

	/* 3 * this0 as 16-bit products.  Even and odd bytes are multiplied
	* separately, then re-interleaved back into sample order.
	*/
	this0e = (__vector short)vec_mule(this0, pb_three);
	this0o = (__vector short)vec_mulo(this0, pb_three);
	this0l = vec_mergeh(this0e, this0o);
	this0h = vec_mergel(this0e, this0o);

	/* NOTE(review): VEC_UNPACKHU()/VEC_UNPACKLU() are defined outside this
	* file; presumably they zero-extend the first/last eight bytes to
	* 16-bit lanes -- confirm.
	*/
	last0l = (__vector short)VEC_UNPACKHU(p_last0);
	last0h = (__vector short)VEC_UNPACKLU(p_last0);
	/* +1 is the rounding term of the even outputs. */
	last0l = vec_add(last0l, pw_one);

	next0l = (__vector short)VEC_UNPACKHU(p_next0);
	next0h = (__vector short)VEC_UNPACKLU(p_next0);
	/* +2 is the rounding term of the odd outputs. */
	next0l = vec_add(next0l, pw_two);

	/* First eight input samples -> 16 output samples. */
	outle = vec_add(this0l, last0l);
	outlo = vec_add(this0l, next0l);
	outle = vec_sr(outle, (__vector unsigned short)pw_two);
	outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

	/* Narrow back to bytes while interleaving even and odd outputs. */
	out = vec_perm((__vector unsigned char)outle,
	(__vector unsigned char)outlo, merge_pack_index);
	vec_st(out, 0, outptr);

	/* The last eight input samples of the block are only processed and
	* stored when more than eight samples remain.
	*/
	if (incol > 8) {
	last0h = vec_add(last0h, pw_one);
	next0h = vec_add(next0h, pw_two);

	outhe = vec_add(this0h, last0h);
	outho = vec_add(this0h, next0h);
	outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
	outho = vec_sr(outho, (__vector unsigned short)pw_two);

	out = vec_perm((__vector unsigned char)outhe,
	(__vector unsigned char)outho, merge_pack_index);
	vec_st(out, 16, outptr);
	}

	/* Slide the window: the block just loaded becomes the current one. */
	this0 = next0;
	}
	}
	}


	/*
	* Fancy 2:1 horizontal and vertical upsampling, 16 input samples per loop
	* iteration.
	*
	* Each input row r produces two output rows.  The upper output row blends
	* row r with row r - 1, the lower output row blends row r with row r + 1:
	*   colsum      = 3 * row[r] + row[r - 1]   (or row[r + 1])
	*   even output = (3 * colsum + previous colsum + 8) >> 4
	*   odd output  = (3 * colsum + next colsum + 7) >> 4
	* The first column sum of a row acts as its own "previous" neighbor and the
	* last one as its own "next" neighbor.
	*
	* max_v_samp_factor  number of output rows to produce (two per iteration)
	* downsampled_width  number of input samples per row
	* input_data         input rows
	* output_data_ptr    points to the array of output rows
	*
	* NOTE(review): the first iteration reads input_data[-1] and every iteration
	* reads input_data[inrow + 1], so the caller presumably provides valid
	* context rows around the ones being upsampled.  When downsampled_width is
	* not a multiple of 16, one byte is written just past the end of all three
	* input rows.  Confirm both against the caller.
	*/
	void
	jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
	JDIMENSION downsampled_width,
	JSAMPARRAY input_data,
	JSAMPARRAY *output_data_ptr)
	{
	JSAMPARRAY output_data = *output_data_ptr;
	JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
	int inrow, outrow, incol;

	/* Naming: "_1" = row above (inrow - 1), "0" = current row, "1" = row below
	* (inrow + 1); suffix "l" = first eight samples of a block, "h" = last
	* eight; "p_" = vector shifted by one 16-bit lane to line up neighbors.
	*/
	__vector unsigned char this_1, this0, this1, out;
	__vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
	lastcolsum_1h, lastcolsum1h,
	p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
	thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
	nextcolsum_1l = {0}, nextcolsum_1h = {0},
	nextcolsum1l = {0}, nextcolsum1h = {0},
	p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
	tmpl, tmph, outle, outhe, outlo, outho;

	/* Constants
	*
	* The *_index vectors are vec_perm() byte selectors that shift a vector of
	* 16-bit lanes by one lane (two bytes).  The _col0 and _lastcol variants
	* repeat the first or last lane instead of pulling one from the adjacent
	* vector.  merge_pack_index interleaves one byte from each 16-bit lane of
	* two vectors (the odd-numbered bytes when __BIG_ENDIAN__ is set, the
	* even-numbered bytes otherwise).
	* NOTE(review): __16X()/__8X() are defined outside this file and presumably
	* repeat their argument to fill the vector; pb_zero is not referenced
	* directly below and is presumably consumed by the VEC_UNPACK*U() macros --
	* confirm in jsimd_altivec.h.
	*/
	__vector unsigned char pb_zero = { __16X(0) },
	last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
	last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
	next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
	next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
	#if __BIG_ENDIAN__
	merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
	#else
	merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
	#endif
	__vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
	pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
	__vector unsigned short pw_four = { __8X(4) };

	/* outrow advances by two per iteration (see the two outrow++ below). */
	for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

	inptr_1 = input_data[inrow - 1];
	inptr0 = input_data[inrow];
	inptr1 = input_data[inrow + 1];
	outptr0 = output_data[outrow++];
	outptr1 = output_data[outrow++];

	/* Width not a multiple of 16: replicate the last sample of each input
	* row one position past its end so the final column sees itself as its
	* right-hand neighbor inside the last (partial) block.
	*/
	if (downsampled_width & 15) {
	inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
	inptr0[downsampled_width] = inptr0[downsampled_width - 1];
	inptr1[downsampled_width] = inptr1[downsampled_width - 1];
	}

	/* Prime the first block: 3 * current row, widened to 16 bits.
	* NOTE(review): VEC_UNPACKHU()/VEC_UNPACKLU() are defined outside this
	* file; presumably they zero-extend the first/last eight bytes to 16-bit
	* lanes -- confirm.
	*/
	this0 = vec_ld(0, inptr0);
	this0l = (__vector short)VEC_UNPACKHU(this0);
	this0h = (__vector short)VEC_UNPACKLU(this0);
	this0l = vec_mladd(this0l, pw_three, pw_zero);
	this0h = vec_mladd(this0h, pw_three, pw_zero);

	/* Column sums for the upper output row (3 * current + row above).  In
	* the first block, lane 0 is its own left-hand neighbor.
	*/
	this_1 = vec_ld(0, inptr_1);
	this_1l = (__vector short)VEC_UNPACKHU(this_1);
	this_1h = (__vector short)VEC_UNPACKLU(this_1);
	thiscolsum_1l = vec_add(this0l, this_1l);
	thiscolsum_1h = vec_add(this0h, this_1h);
	lastcolsum_1h = thiscolsum_1h;
	p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
	p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

	/* Column sums for the lower output row (3 * current + row below). */
	this1 = vec_ld(0, inptr1);
	this1l = (__vector short)VEC_UNPACKHU(this1);
	this1h = (__vector short)VEC_UNPACKLU(this1);
	thiscolsum1l = vec_add(this0l, this1l);
	thiscolsum1h = vec_add(this0h, this1h);
	lastcolsum1h = thiscolsum1h;
	p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
	p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

	/* incol counts the input samples still to be consumed in this row. */
	for (incol = downsampled_width; incol > 0;
	incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
	outptr0 += 32, outptr1 += 32) {

	/* Every block after the first: left-hand neighbors come from the tail
	* of the previous block's column sums.
	*/
	if (downsampled_width - incol > 0) {
	p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
	p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
	p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
	p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
	lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
	}

	if (incol <= 16) {
	/* Last block: nothing further to load, so the final lane is its own
	* right-hand neighbor.
	*/
	p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
	p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
	next_index_lastcol);
	p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
	p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
	next_index_lastcol);
	} else {
	/* Load the following block of all three rows and build its column
	* sums; its first lane is the right-hand neighbor of this block's
	* last lane.  this0l/this0h are reused for both rows' sums.
	*/
	this0 = vec_ld(16, inptr0);
	this0l = (__vector short)VEC_UNPACKHU(this0);
	this0h = (__vector short)VEC_UNPACKLU(this0);
	this0l = vec_mladd(this0l, pw_three, pw_zero);
	this0h = vec_mladd(this0h, pw_three, pw_zero);

	this_1 = vec_ld(16, inptr_1);
	this_1l = (__vector short)VEC_UNPACKHU(this_1);
	this_1h = (__vector short)VEC_UNPACKLU(this_1);
	nextcolsum_1l = vec_add(this0l, this_1l);
	nextcolsum_1h = vec_add(this0h, this_1h);
	p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
	p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

	this1 = vec_ld(16, inptr1);
	this1l = (__vector short)VEC_UNPACKHU(this1);
	this1h = (__vector short)VEC_UNPACKLU(this1);
	nextcolsum1l = vec_add(this0l, this1l);
	nextcolsum1h = vec_add(this0h, this1h);
	p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
	p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
	}

	/* Process the upper row */

	/* Even outputs round with +8, odd outputs with +7, before >> 4. */
	tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
	outle = vec_add(tmpl, p_lastcolsum_1l);
	outle = vec_add(outle, pw_eight);
	outle = vec_sr(outle, pw_four);

	outlo = vec_add(tmpl, p_nextcolsum_1l);
	outlo = vec_add(outlo, pw_seven);
	outlo = vec_sr(outlo, pw_four);

	/* Narrow back to bytes while interleaving even and odd outputs. */
	out = vec_perm((__vector unsigned char)outle,
	(__vector unsigned char)outlo, merge_pack_index);
	vec_st(out, 0, outptr0);

	/* The last eight columns of the block are only processed and stored
	* when more than eight input samples remain.
	*/
	if (incol > 8) {
	tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
	outhe = vec_add(tmph, p_lastcolsum_1h);
	outhe = vec_add(outhe, pw_eight);
	outhe = vec_sr(outhe, pw_four);

	outho = vec_add(tmph, p_nextcolsum_1h);
	outho = vec_add(outho, pw_seven);
	outho = vec_sr(outho, pw_four);

	out = vec_perm((__vector unsigned char)outhe,
	(__vector unsigned char)outho, merge_pack_index);
	vec_st(out, 16, outptr0);
	}

	/* Process the lower row */

	tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
	outle = vec_add(tmpl, p_lastcolsum1l);
	outle = vec_add(outle, pw_eight);
	outle = vec_sr(outle, pw_four);

	outlo = vec_add(tmpl, p_nextcolsum1l);
	outlo = vec_add(outlo, pw_seven);
	outlo = vec_sr(outlo, pw_four);

	out = vec_perm((__vector unsigned char)outle,
	(__vector unsigned char)outlo, merge_pack_index);
	vec_st(out, 0, outptr1);

	if (incol > 8) {
	tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
	outhe = vec_add(tmph, p_lastcolsum1h);
	outhe = vec_add(outhe, pw_eight);
	outhe = vec_sr(outhe, pw_four);

	outho = vec_add(tmph, p_nextcolsum1h);
	outho = vec_add(outho, pw_seven);
	outho = vec_sr(outho, pw_four);

	out = vec_perm((__vector unsigned char)outhe,
	(__vector unsigned char)outho, merge_pack_index);
	vec_st(out, 16, outptr1);
	}

	/* Slide the window: the sums just built become the current ones. */
	thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
	thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
	}
	}
	}


	/* These are rarely used (mainly just for decompressing YCCK images) */

	/*
	* Plain 2:1 horizontal upsampling: every input byte is emitted twice in
	* succession.
	*
	* max_v_samp_factor  number of rows to process
	* output_width       number of output samples per row
	* input_data         input rows
	* output_data_ptr    points to the array of output rows
	*
	* NOTE(review): the per-row byte count is rounded up to a multiple of 32 and
	* whole 16-byte vectors are loaded and stored, so this presumably relies on
	* padded, 16-byte-aligned row buffers -- confirm against the caller.
	*/
	void
	jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
	                             JDIMENSION output_width,
	                             JSAMPARRAY input_data,
	                             JSAMPARRAY *output_data_ptr)
	{
	  JSAMPARRAY dst_rows = *output_data_ptr;
	  JSAMPROW src, dst;
	  int row, remaining;
	  __vector unsigned char samples, dup_first, dup_second;

	  row = 0;
	  while (row < max_v_samp_factor) {
	    src = input_data[row];
	    dst = dst_rows[row];

	    /* Output bytes still to produce, rounded up to a multiple of 32. */
	    remaining = (output_width + 31) & (~31);

	    while (remaining > 0) {
	      /* Merging a vector with itself doubles each byte: vec_mergeh()
	       * covers the first eight bytes, vec_mergel() the last eight.
	       */
	      samples = vec_ld(0, src);
	      dup_first = vec_mergeh(samples, samples);
	      dup_second = vec_mergel(samples, samples);

	      vec_st(dup_first, 0, dst);
	      vec_st(dup_second, 16, dst);

	      /* Handle a second 16-byte input vector only if more than 32 output
	       * bytes are still outstanding.
	       */
	      if (remaining > 32) {
	        samples = vec_ld(16, src);
	        dup_first = vec_mergeh(samples, samples);
	        dup_second = vec_mergel(samples, samples);

	        vec_st(dup_first, 32, dst);
	        vec_st(dup_second, 48, dst);
	      }

	      remaining -= 64;
	      src += 32;
	      dst += 64;
	    }

	    row++;
	  }
	}


	/*
	* Plain 2:1 horizontal and vertical upsampling: every input byte is emitted
	* twice in succession, and each doubled row is written to two consecutive
	* output rows.
	*
	* max_v_samp_factor  number of output rows to produce (two per input row)
	* output_width       number of output samples per row
	* input_data         input rows
	* output_data_ptr    points to the array of output rows
	*
	* NOTE(review): the per-row byte count is rounded up to a multiple of 32 and
	* whole 16-byte vectors are loaded and stored, so this presumably relies on
	* padded, 16-byte-aligned row buffers -- confirm against the caller.
	*/
	void
	jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
	                             JDIMENSION output_width,
	                             JSAMPARRAY input_data,
	                             JSAMPARRAY *output_data_ptr)
	{
	  JSAMPARRAY dst_rows = *output_data_ptr;
	  JSAMPROW src, dst_upper, dst_lower;
	  int src_row, dst_row, remaining;
	  __vector unsigned char samples, dup_first, dup_second;

	  src_row = 0;
	  dst_row = 0;
	  while (dst_row < max_v_samp_factor) {
	    src = input_data[src_row];
	    dst_upper = dst_rows[dst_row];
	    dst_lower = dst_rows[dst_row + 1];
	    dst_row += 2;

	    /* Output bytes still to produce per row, rounded up to a multiple of
	     * 32.
	     */
	    remaining = (output_width + 31) & (~31);

	    while (remaining > 0) {
	      /* Merging a vector with itself doubles each byte: vec_mergeh()
	       * covers the first eight bytes, vec_mergel() the last eight.
	       */
	      samples = vec_ld(0, src);
	      dup_first = vec_mergeh(samples, samples);
	      dup_second = vec_mergel(samples, samples);

	      vec_st(dup_first, 0, dst_upper);
	      vec_st(dup_first, 0, dst_lower);

	      vec_st(dup_second, 16, dst_upper);
	      vec_st(dup_second, 16, dst_lower);

	      /* Handle a second 16-byte input vector only if more than 32 output
	       * bytes are still outstanding.
	       */
	      if (remaining > 32) {
	        samples = vec_ld(16, src);
	        dup_first = vec_mergeh(samples, samples);
	        dup_second = vec_mergel(samples, samples);

	        vec_st(dup_first, 32, dst_upper);
	        vec_st(dup_first, 32, dst_lower);

	        vec_st(dup_second, 48, dst_upper);
	        vec_st(dup_second, 48, dst_lower);
	      }

	      remaining -= 64;
	      src += 32;
	      dst_upper += 64;
	      dst_lower += 64;
	    }

	    src_row++;
	  }
	}