| /* |
| * AltiVec optimizations for libjpeg-turbo |
| * |
| * Copyright (C) 2015, D. R. Commander. |
| * All rights reserved. |
| * This software is provided 'as-is', without any express or implied |
| * warranty. In no event will the authors be held liable for any damages |
| * arising from the use of this software. |
| * |
| * Permission is granted to anyone to use this software for any purpose, |
| * including commercial applications, and to alter it and redistribute it |
| * freely, subject to the following restrictions: |
| * |
| * 1. The origin of this software must not be misrepresented; you must not |
| * claim that you wrote the original software. If you use this software |
| * in a product, an acknowledgment in the product documentation would be |
| * appreciated but is not required. |
| * 2. Altered source versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * 3. This notice may not be removed or altered from any source distribution. |
| */ |
| |
| /* CHROMA UPSAMPLING */ |
| |
| #include "jsimd_altivec.h" |
| |
| /* |
| * Fancy 2:1 horizontal upsampling using a 3:1 weighted filter. |
| * |
| * Each input sample s[i] yields two output samples: |
| * out[2i] = (3 * s[i] + s[i-1] + 1) >> 2 |
| * out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2 |
| * At the left edge s[-1] is taken to be s[0]; at the right edge the sample |
| * following the last one is taken to be the last one. |
| * |
| * max_v_samp_factor: number of rows processed (rows 0 .. max_v_samp_factor - 1 |
| * of both the input and the output array). |
| * downsampled_width: number of input samples per row. |
| * input_data: input rows. When downsampled_width is not a multiple of 16, one |
| * byte is WRITTEN at index downsampled_width of every input row. |
| * output_data_ptr: points to the array of output rows. Results are stored as |
| * whole 16-byte vectors, so the last store of a row can extend past |
| * 2 * downsampled_width, up to the end of that vector. |
| * |
| * NOTE(review): all accesses go through vec_ld/vec_st on whole 16-byte |
| * vectors; presumably the row buffers are 16-byte aligned and padded |
| * accordingly -- confirm against the code that allocates them. |
| */ |
| void |
| jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor, |
| JDIMENSION downsampled_width, |
| JSAMPARRAY input_data, |
| JSAMPARRAY *output_data_ptr) |
| { |
| JSAMPARRAY output_data = *output_data_ptr; |
| JSAMPROW inptr, outptr; |
| int inrow, incol; |
| /* this0 / last0 / next0: current, previous and following 16-sample block. |
| * p_last0 / p_next0: for every position of the current block, its left / |
| * right neighbour sample. |
| * next0 is zero-initialized so that the "this0 = next0" copy at the end of |
| * the final iteration (where no next block is loaded) copies a defined value. |
| */ |
| __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0, |
| out; |
| __vector short this0e, this0o, this0l, this0h, last0l, last0h, |
| next0l, next0h, outle, outhe, outlo, outho; |
| |
| /* Constants. |
| * pb_zero is not referenced by name in this function. NOTE(review): |
| * presumably it is picked up by the VEC_UNPACKHU / VEC_UNPACKLU macros -- |
| * check jsimd_altivec.h before removing or renaming it. |
| * vec_perm byte selectors (0-15 = first operand, 16-31 = second operand): |
| * last_index_col0: shift right by one byte, repeating byte 0 (left edge). |
| * last_index: byte 15 of the first operand, then bytes 0-14 of the second. |
| * next_index: bytes 1-15 of the first operand, then byte 0 of the second. |
| * next_index_lastcol: shift left by one byte, repeating byte 15 (right edge). |
| * merge_pack_index: alternately take one byte of each 16-bit lane from the |
| * first and the second operand (byte 1 of the lane on big-endian, byte 0 |
| * otherwise), i.e. interleave two vectors of shorts into one vector of bytes. |
| */ |
| __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, |
| last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}, |
| last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30}, |
| next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, |
| next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15}, |
| #if __BIG_ENDIAN__ |
| merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; |
| #else |
| merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; |
| #endif |
| __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; |
| |
| for (inrow = 0; inrow < max_v_samp_factor; inrow++) { |
| inptr = input_data[inrow]; |
| outptr = output_data[inrow]; |
| /* Partial final block: copy the last sample one position past the end of |
| * the row, so that the right-hand neighbour of the last real sample is a |
| * copy of that sample. This modifies the input row. |
| */ |
| if (downsampled_width & 15) |
| inptr[downsampled_width] = inptr[downsampled_width - 1]; |
| /* First block of the row: its left neighbours are the block itself shifted |
| * by one byte with byte 0 repeated. |
| */ |
| this0 = vec_ld(0, inptr); |
| p_last0 = vec_perm(this0, this0, last_index_col0); |
| last0 = this0; |
| |
| for (incol = downsampled_width; incol > 0; /* incol = input samples still to process */ |
| incol -= 16, inptr += 16, outptr += 32) { |
| /* Every block except the first takes its left neighbours from the tail |
| * of the previous block followed by the head of the current one. |
| */ |
| if (downsampled_width - incol > 0) { |
| p_last0 = vec_perm(last0, this0, last_index); |
| last0 = this0; |
| } |
| /* Right neighbours: the final block repeats its byte 15, any other block |
| * loads the following 16 bytes and borrows their first byte. |
| */ |
| if (incol <= 16) |
| p_next0 = vec_perm(this0, this0, next_index_lastcol); |
| else { |
| next0 = vec_ld(16, inptr); |
| p_next0 = vec_perm(this0, next0, next_index); |
| } |
| /* 3 * current sample as 16-bit values: the even and odd bytes are |
| * multiplied separately and the products are merged back together. |
| */ |
| this0e = (__vector short)vec_mule(this0, pb_three); |
| this0o = (__vector short)vec_mulo(this0, pb_three); |
| this0l = vec_mergeh(this0e, this0o); |
| this0h = vec_mergel(this0e, this0o); |
| /* Widen the neighbours to 16 bits. VEC_UNPACKHU / VEC_UNPACKLU are defined |
| * outside this file (presumably zero-extending unpacks of the two halves |
| * of the vector -- confirm). Only the "l" half gets its rounding bias |
| * here; the "h" half is biased later, and only if it is needed. |
| */ |
| last0l = (__vector short)VEC_UNPACKHU(p_last0); |
| last0h = (__vector short)VEC_UNPACKLU(p_last0); |
| last0l = vec_add(last0l, pw_one); /* +1: bias for the left-weighted output */ |
| |
| next0l = (__vector short)VEC_UNPACKHU(p_next0); |
| next0h = (__vector short)VEC_UNPACKLU(p_next0); |
| next0l = vec_add(next0l, pw_two); /* +2: bias for the right-weighted output */ |
| /* outle = (3 * cur + left + 1) >> 2, outlo = (3 * cur + right + 2) >> 2 */ |
| outle = vec_add(this0l, last0l); |
| outlo = vec_add(this0l, next0l); |
| outle = vec_sr(outle, (__vector unsigned short)pw_two); |
| outlo = vec_sr(outlo, (__vector unsigned short)pw_two); |
| /* Interleave the two result vectors, narrow to bytes, store 16 outputs. */ |
| out = vec_perm((__vector unsigned char)outle, |
| (__vector unsigned char)outlo, merge_pack_index); |
| vec_st(out, 0, outptr); |
| /* The "h" half is computed and stored only when more than 8 input samples |
| * remain in the row. |
| */ |
| if (incol > 8) { |
| last0h = vec_add(last0h, pw_one); |
| next0h = vec_add(next0h, pw_two); |
| |
| outhe = vec_add(this0h, last0h); |
| outho = vec_add(this0h, next0h); |
| outhe = vec_sr(outhe, (__vector unsigned short)pw_two); |
| outho = vec_sr(outho, (__vector unsigned short)pw_two); |
| |
| out = vec_perm((__vector unsigned char)outhe, |
| (__vector unsigned char)outho, merge_pack_index); |
| vec_st(out, 16, outptr); |
| } |
| /* Slide the window: the block loaded as "next" becomes the current one. */ |
| this0 = next0; |
| } |
| } |
| } |
| |
| /* |
| * Fancy 2:1 horizontal and 2:1 vertical upsampling. |
| * |
| * Every input row produces two output rows. The upper output row combines |
| * the current input row with the row above it (inrow - 1), the lower output |
| * row combines it with the row below (inrow + 1). With c = sample of the |
| * current row and v = sample of the vertical neighbour in the same column: |
| * colsum[i] = 3 * c[i] + v[i] |
| * out[2i] = (3 * colsum[i] + colsum[i-1] + 8) >> 4 |
| * out[2i+1] = (3 * colsum[i] + colsum[i+1] + 7) >> 4 |
| * colsum[-1] is taken to be colsum[0], and the column sum following the last |
| * one is taken to be the last one. |
| * |
| * max_v_samp_factor: number of OUTPUT rows; output rows are always written in |
| * pairs, one pair per loop iteration. |
| * downsampled_width: number of input samples per row. |
| * input_data: input rows. Rows inrow - 1 and inrow + 1 are read, so index -1 |
| * is accessed on the first iteration. NOTE(review): presumably the caller |
| * guarantees valid context rows above and below -- confirm. When |
| * downsampled_width is not a multiple of 16, one byte is WRITTEN at index |
| * downsampled_width of all three rows involved. |
| * output_data_ptr: points to the array of output rows. Results are stored as |
| * whole 16-byte vectors, so the last store of a row can extend past |
| * 2 * downsampled_width, up to the end of that vector. |
| * |
| * NOTE(review): all accesses go through vec_ld/vec_st on whole 16-byte |
| * vectors; presumably the row buffers are 16-byte aligned and padded |
| * accordingly -- confirm against the code that allocates them. |
| */ |
| void |
| jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor, |
| JDIMENSION downsampled_width, |
| JSAMPARRAY input_data, |
| JSAMPARRAY *output_data_ptr) |
| { |
| JSAMPARRAY output_data = *output_data_ptr; |
| JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; |
| int inrow, outrow, incol; |
| /* Naming: |
| * suffix _1: values for the upper output row (current row + row above); |
| * suffix 1: values for the lower output row (current row + row below); |
| * l / h: the two halves of a 16-sample block; "l" results are stored at |
| * output offset 0, "h" results at output offset 16; |
| * lastcolsum / thiscolsum / nextcolsum: column sums of the previous, |
| * current and following block; |
| * p_ prefix: column sums realigned by one 16-bit element so that each lane |
| * holds the left (last) or right (next) neighbour of the current lane. |
| * The nextcolsum vectors are zero-initialized so that the copies at the end |
| * of the final iteration (where no next block is loaded) use defined values. |
| */ |
| __vector unsigned char this_1, this0, this1, out; |
| __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, |
| lastcolsum_1h, lastcolsum1h, |
| p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, |
| thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, |
| nextcolsum_1l = {0}, nextcolsum_1h = {0}, |
| nextcolsum1l = {0}, nextcolsum1h = {0}, |
| p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, |
| tmpl, tmph, outle, outhe, outlo, outho; |
| |
| /* Constants. |
| * pb_zero is not referenced by name in this function. NOTE(review): |
| * presumably it is picked up by the VEC_UNPACKHU / VEC_UNPACKLU macros -- |
| * check jsimd_altivec.h before removing or renaming it. |
| * vec_perm byte selectors, here moving whole 16-bit elements |
| * (bytes 0-15 = first operand, bytes 16-31 = second operand): |
| * last_index_col0: shift right by one element, repeating element 0. |
| * last_index: element 7 of the first operand, then elements 0-6 of the second. |
| * next_index: elements 1-7 of the first operand, then element 0 of the second. |
| * next_index_lastcol: shift left by one element, repeating element 7. |
| * merge_pack_index: alternately take one byte of each 16-bit lane from the |
| * first and the second operand (byte 1 of the lane on big-endian, byte 0 |
| * otherwise), i.e. interleave two vectors of shorts into one vector of bytes. |
| * pw_zero is the addend that turns vec_mladd into a plain multiply; |
| * pw_eight / pw_seven are the rounding biases and pw_four the shift count. |
| */ |
| __vector unsigned char pb_zero = { __16X(0) }, |
| last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, |
| last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29}, |
| next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17}, |
| next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15}, |
| #if __BIG_ENDIAN__ |
| merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; |
| #else |
| merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; |
| #endif |
| __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, |
| pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; |
| __vector unsigned short pw_four = { __8X(4) }; |
| |
| for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { |
| /* Row above, current row, row below; two consecutive output rows. */ |
| inptr_1 = input_data[inrow - 1]; |
| inptr0 = input_data[inrow]; |
| inptr1 = input_data[inrow + 1]; |
| outptr0 = output_data[outrow++]; |
| outptr1 = output_data[outrow++]; |
| /* Partial final block: copy the last sample of each of the three rows one |
| * position past its end, so that the column sum to the right of the last |
| * real column equals the last real column sum. This modifies the input rows. |
| */ |
| if (downsampled_width & 15) { |
| inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; |
| inptr0[downsampled_width] = inptr0[downsampled_width - 1]; |
| inptr1[downsampled_width] = inptr1[downsampled_width - 1]; |
| } |
| /* First block: 3 * current row as 16-bit values. VEC_UNPACKHU / |
| * VEC_UNPACKLU are defined outside this file (presumably zero-extending |
| * unpacks of the two halves of the vector -- confirm). |
| */ |
| this0 = vec_ld(0, inptr0); |
| this0l = (__vector short)VEC_UNPACKHU(this0); |
| this0h = (__vector short)VEC_UNPACKLU(this0); |
| this0l = vec_mladd(this0l, pw_three, pw_zero); |
| this0h = vec_mladd(this0h, pw_three, pw_zero); |
| /* Column sums for the upper output row (3 * current + above) and their |
| * left neighbours; element 0 is its own left neighbour at the row start. |
| */ |
| this_1 = vec_ld(0, inptr_1); |
| this_1l = (__vector short)VEC_UNPACKHU(this_1); |
| this_1h = (__vector short)VEC_UNPACKLU(this_1); |
| thiscolsum_1l = vec_add(this0l, this_1l); |
| thiscolsum_1h = vec_add(this0h, this_1h); |
| lastcolsum_1h = thiscolsum_1h; |
| p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); |
| p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); |
| /* The same for the lower output row (3 * current + below). */ |
| this1 = vec_ld(0, inptr1); |
| this1l = (__vector short)VEC_UNPACKHU(this1); |
| this1h = (__vector short)VEC_UNPACKLU(this1); |
| thiscolsum1l = vec_add(this0l, this1l); |
| thiscolsum1h = vec_add(this0h, this1h); |
| lastcolsum1h = thiscolsum1h; |
| p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); |
| p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); |
| |
| for (incol = downsampled_width; incol > 0; /* incol = input samples still to process */ |
| incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, |
| outptr0 += 32, outptr1 += 32) { |
| /* Every block except the first takes the left neighbour of its element 0 |
| * from the last column sum of the previous block. |
| */ |
| if (downsampled_width - incol > 0) { |
| p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); |
| p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); |
| p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); |
| p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); |
| lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; |
| } |
| /* Right neighbours. */ |
| if (incol <= 16) { /* final block: the last element is its own right neighbour */ |
| p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); |
| p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, |
| next_index_lastcol); |
| p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); |
| p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, |
| next_index_lastcol); |
| } else { /* load the following block and compute its column sums */ |
| this0 = vec_ld(16, inptr0); |
| this0l = (__vector short)VEC_UNPACKHU(this0); |
| this0h = (__vector short)VEC_UNPACKLU(this0); |
| this0l = vec_mladd(this0l, pw_three, pw_zero); |
| this0h = vec_mladd(this0h, pw_three, pw_zero); |
| |
| this_1 = vec_ld(16, inptr_1); |
| this_1l = (__vector short)VEC_UNPACKHU(this_1); |
| this_1h = (__vector short)VEC_UNPACKLU(this_1); |
| nextcolsum_1l = vec_add(this0l, this_1l); |
| nextcolsum_1h = vec_add(this0h, this_1h); |
| p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); |
| p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); |
| |
| this1 = vec_ld(16, inptr1); |
| this1l = (__vector short)VEC_UNPACKHU(this1); |
| this1h = (__vector short)VEC_UNPACKLU(this1); |
| nextcolsum1l = vec_add(this0l, this1l); |
| nextcolsum1h = vec_add(this0h, this1h); |
| p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); |
| p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); |
| } |
| |
| /* Upper output row: |
| * outle = (3 * colsum + left colsum + 8) >> 4, |
| * outlo = (3 * colsum + right colsum + 7) >> 4. |
| */ |
| |
| tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); |
| outle = vec_add(tmpl, p_lastcolsum_1l); |
| outle = vec_add(outle, pw_eight); |
| outle = vec_sr(outle, pw_four); |
| |
| outlo = vec_add(tmpl, p_nextcolsum_1l); |
| outlo = vec_add(outlo, pw_seven); |
| outlo = vec_sr(outlo, pw_four); |
| /* Interleave the two result vectors, narrow to bytes, store 16 outputs. */ |
| out = vec_perm((__vector unsigned char)outle, |
| (__vector unsigned char)outlo, merge_pack_index); |
| vec_st(out, 0, outptr0); |
| /* The "h" half is computed and stored only when more than 8 input samples |
| * remain in the row. |
| */ |
| if (incol > 8) { |
| tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); |
| outhe = vec_add(tmph, p_lastcolsum_1h); |
| outhe = vec_add(outhe, pw_eight); |
| outhe = vec_sr(outhe, pw_four); |
| |
| outho = vec_add(tmph, p_nextcolsum_1h); |
| outho = vec_add(outho, pw_seven); |
| outho = vec_sr(outho, pw_four); |
| |
| out = vec_perm((__vector unsigned char)outhe, |
| (__vector unsigned char)outho, merge_pack_index); |
| vec_st(out, 16, outptr0); |
| } |
| |
| /* Lower output row: same arithmetic on the column sums built from the row below. */ |
| |
| tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); |
| outle = vec_add(tmpl, p_lastcolsum1l); |
| outle = vec_add(outle, pw_eight); |
| outle = vec_sr(outle, pw_four); |
| |
| outlo = vec_add(tmpl, p_nextcolsum1l); |
| outlo = vec_add(outlo, pw_seven); |
| outlo = vec_sr(outlo, pw_four); |
| |
| out = vec_perm((__vector unsigned char)outle, |
| (__vector unsigned char)outlo, merge_pack_index); |
| vec_st(out, 0, outptr1); |
| |
| if (incol > 8) { |
| tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); |
| outhe = vec_add(tmph, p_lastcolsum1h); |
| outhe = vec_add(outhe, pw_eight); |
| outhe = vec_sr(outhe, pw_four); |
| |
| outho = vec_add(tmph, p_nextcolsum1h); |
| outho = vec_add(outho, pw_seven); |
| outho = vec_sr(outho, pw_four); |
| |
| out = vec_perm((__vector unsigned char)outhe, |
| (__vector unsigned char)outho, merge_pack_index); |
| vec_st(out, 16, outptr1); |
| } |
| /* Slide the window: the column sums computed for the next block become the current ones. */ |
| thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; |
| thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; |
| } |
| } |
| } |
| |
| |
| /* These are rarely used (mainly just for decompressing YCCK images) */ |
| |
| /* |
| * Plain 2:1 horizontal upsampling: every input sample is written twice in a |
| * row to the output. |
| * |
| * max_v_samp_factor: number of rows processed. |
| * output_width: number of output samples per row. It is rounded up to a |
| * multiple of 32 before processing, so whole 16-byte vectors beyond |
| * output_width may be read from the input row and written to the output row. |
| * input_data: input rows. |
| * output_data_ptr: points to the array of output rows. |
| * |
| * NOTE(review): vec_ld/vec_st work on whole 16-byte vectors; presumably the |
| * row buffers are 16-byte aligned and padded accordingly -- confirm against |
| * the code that allocates them. |
| */ |
| void |
| jsimd_h2v1_upsample_altivec (int max_v_samp_factor, |
|                              JDIMENSION output_width, |
|                              JSAMPARRAY input_data, |
|                              JSAMPARRAY *output_data_ptr) |
| { |
|   JSAMPARRAY output_data = *output_data_ptr; |
|   int row; |
| |
|   for (row = 0; row < max_v_samp_factor; row++) { |
|     JSAMPROW src = input_data[row]; |
|     JSAMPROW dst = output_data[row]; |
|     /* Output samples still to produce, rounded up to a multiple of 32. */ |
|     int remaining = (output_width + 31) & (~31); |
| |
|     /* One pass consumes up to 32 input samples and emits up to 64 outputs. */ |
|     while (remaining > 0) { |
|       __vector unsigned char samples = vec_ld(0, src); |
|       /* Merging a vector with itself duplicates each of its bytes. */ |
|       __vector unsigned char dup_hi = vec_mergeh(samples, samples); |
|       __vector unsigned char dup_lo = vec_mergel(samples, samples); |
| |
|       vec_st(dup_hi, 0, dst); |
|       vec_st(dup_lo, 16, dst); |
| |
|       /* The second 16 input samples are needed only if more than 32 outputs remain. */ |
|       if (remaining > 32) { |
|         samples = vec_ld(16, src); |
|         dup_hi = vec_mergeh(samples, samples); |
|         dup_lo = vec_mergel(samples, samples); |
| |
|         vec_st(dup_hi, 32, dst); |
|         vec_st(dup_lo, 48, dst); |
|       } |
| |
|       remaining -= 64; |
|       src += 32; |
|       dst += 64; |
|     } |
|   } |
| } |
| |
| |
| /* |
| * Plain 2:1 horizontal and 2:1 vertical upsampling: every input sample is |
| * written twice in a row, and the resulting row is written to two consecutive |
| * output rows. |
| * |
| * max_v_samp_factor: number of OUTPUT rows; output rows are always written in |
| * pairs, one pair per input row. |
| * output_width: number of output samples per row. It is rounded up to a |
| * multiple of 32 before processing, so whole 16-byte vectors beyond |
| * output_width may be read from the input row and written to the output rows. |
| * input_data: input rows. |
| * output_data_ptr: points to the array of output rows. |
| * |
| * NOTE(review): vec_ld/vec_st work on whole 16-byte vectors; presumably the |
| * row buffers are 16-byte aligned and padded accordingly -- confirm against |
| * the code that allocates them. |
| */ |
| void |
| jsimd_h2v2_upsample_altivec (int max_v_samp_factor, |
|                              JDIMENSION output_width, |
|                              JSAMPARRAY input_data, |
|                              JSAMPARRAY *output_data_ptr) |
| { |
|   JSAMPARRAY output_data = *output_data_ptr; |
|   int src_row = 0; |
|   int dst_row = 0; |
| |
|   while (dst_row < max_v_samp_factor) { |
|     JSAMPROW src = input_data[src_row]; |
|     JSAMPROW dst_upper = output_data[dst_row]; |
|     JSAMPROW dst_lower = output_data[dst_row + 1]; |
|     /* Output samples still to produce, rounded up to a multiple of 32. */ |
|     int remaining = (output_width + 31) & (~31); |
| |
|     /* One pass consumes up to 32 input samples and emits up to 64 outputs per row. */ |
|     while (remaining > 0) { |
|       __vector unsigned char samples = vec_ld(0, src); |
|       /* Merging a vector with itself duplicates each of its bytes. */ |
|       __vector unsigned char dup_hi = vec_mergeh(samples, samples); |
|       __vector unsigned char dup_lo = vec_mergel(samples, samples); |
| |
|       vec_st(dup_hi, 0, dst_upper); |
|       vec_st(dup_hi, 0, dst_lower); |
| |
|       vec_st(dup_lo, 16, dst_upper); |
|       vec_st(dup_lo, 16, dst_lower); |
| |
|       /* The second 16 input samples are needed only if more than 32 outputs remain. */ |
|       if (remaining > 32) { |
|         samples = vec_ld(16, src); |
|         dup_hi = vec_mergeh(samples, samples); |
|         dup_lo = vec_mergel(samples, samples); |
| |
|         vec_st(dup_hi, 32, dst_upper); |
|         vec_st(dup_hi, 32, dst_lower); |
| |
|         vec_st(dup_lo, 48, dst_upper); |
|         vec_st(dup_lo, 48, dst_lower); |
|       } |
| |
|       remaining -= 64; |
|       src += 32; |
|       dst_upper += 64; |
|       dst_lower += 64; |
|     } |
| |
|     src_row++; |
|     dst_row += 2; |
|   } |
| } |