blob: 51176d4596ad794d6890f783227d18fbbd8e347f [file] [log] [blame]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +00001;
DRC72130be2014-05-09 20:14:26 +00002; jdsample.asm - upsampling (SSE2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +00003;
Pierre Ossmaneea72152009-03-09 13:34:17 +00004; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +00007; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000017; [TAB8]
18
Pierre Ossman3a65ef42009-03-16 13:34:18 +000019%include "jsimdext.inc"
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000020
21; --------------------------------------------------------------------------
; Constant table used by the two "fancy" (triangle filter) upsamplers below.
; Each entry is one 16-byte vector holding eight identical 16-bit words:
;   PW_THREE - weight applied (pmullw) to the nearer sample / intermediate
;   PW_ONE   - bias added before the ">> 2" of the even h2v1 outputs
;   PW_TWO   - bias added before the ">> 2" of the odd h2v1 outputs
;   PW_EIGHT - bias added before the ">> 4" of the even h2v2 outputs
;   PW_SEVEN - bias added before the ">> 4" of the odd h2v2 outputs
; The table is addressed through GOTOFF(ebx, ...) in the code section.
DRCe5eaf372014-05-09 18:00:32 +000022 SECTION SEG_CONST
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000023
DRCe5eaf372014-05-09 18:00:32 +000024 alignz 16
25 global EXTN(jconst_fancy_upsample_sse2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000026
27EXTN(jconst_fancy_upsample_sse2):
28
DRCe5eaf372014-05-09 18:00:32 +000029PW_ONE times 8 dw 1
30PW_TWO times 8 dw 2
31PW_THREE times 8 dw 3
32PW_SEVEN times 8 dw 7
33PW_EIGHT times 8 dw 8
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000034
DRCe5eaf372014-05-09 18:00:32 +000035 alignz 16
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000036
37; --------------------------------------------------------------------------
DRCe5eaf372014-05-09 18:00:32 +000038 SECTION SEG_TEXT
39 BITS 32
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000040;
41; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
42;
43; The upsampling algorithm is linear interpolation between pixel centers,
44; also known as a "triangle filter". This is a good compromise between
45; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
46; of the way between input pixel centers.
47;
48; GLOBAL(void)
Pierre Ossmaneea72152009-03-09 13:34:17 +000049; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
50; JDIMENSION downsampled_width,
51; JSAMPARRAY input_data,
52; JSAMPARRAY * output_data_ptr);
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000053;
54
; Argument offsets: the four arguments are read from the stack relative to
; the frame pointer established in the prologue ([ebp+8] is the first one).
DRCe5eaf372014-05-09 18:00:32 +000055%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
56%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
57%define input_data(b) (b)+16 ; JSAMPARRAY input_data
58%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000059
DRCe5eaf372014-05-09 18:00:32 +000060 align 16
61 global EXTN(jsimd_h2v1_fancy_upsample_sse2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000062
; Register roles:
;   eax  = colctr: input columns left in the row (rounded up to a multiple
;          of SIZEOF_XMMWORD once the row has been set up)
;   ecx  = rowctr
;   esi  = input_data (row-pointer array); inptr while inside a row
;   edi  = output_data (row-pointer array); outptr while inside a row
;   ebx  = base register handed to GOTOFF() to reach the PW_* constants
;   xmm0 = all zeros (used to widen bytes to words)
;   xmm7 = left neighbour of the block's first sample, carried between blocks
;   xmm6 = right neighbour of the block's last sample
; For each input sample s[i] two output samples are produced:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is replaced by s[0]; at the right edge the missing
; neighbour is replaced by the last sample of the final block.
; All vector loads/stores use movdqa, so every row pointer is expected to be
; 16-byte aligned -- NOTE(review): confirm against the buffer allocator.
; Returns nothing; does no work if downsampled_width or max_v_samp_factor is 0.
Pierre Ossmaneea72152009-03-09 13:34:17 +000063EXTN(jsimd_h2v1_fancy_upsample_sse2):
DRCe5eaf372014-05-09 18:00:32 +000064 push ebp
65 mov ebp,esp
66 pushpic ebx
67; push ecx ; need not be preserved
68; push edx ; need not be preserved
69 push esi
70 push edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000071
DRCe5eaf372014-05-09 18:00:32 +000072 get_GOT ebx ; get GOT address
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000073
DRCe5eaf372014-05-09 18:00:32 +000074 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
75 test eax,eax
76 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000077
DRCe5eaf372014-05-09 18:00:32 +000078 mov ecx, INT [max_v_samp(ebp)] ; rowctr
79 test ecx,ecx
80 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000081
DRCe5eaf372014-05-09 18:00:32 +000082 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
83 mov edi, POINTER [output_data_ptr(ebp)]
84 mov edi, JSAMPARRAY [edi] ; output_data
85 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000086.rowloop:
; Save the per-row state; eax/esi/edi are reused as colctr/inptr/outptr and
; restored (in reverse order) after the last column block.
DRCe5eaf372014-05-09 18:00:32 +000087 push eax ; colctr
88 push edi
89 push esi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000090
DRCe5eaf372014-05-09 18:00:32 +000091 mov esi, JSAMPROW [esi] ; inptr
92 mov edi, JSAMPROW [edi] ; outptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000093
; If the width is not a multiple of SIZEOF_XMMWORD, duplicate the last real
; sample into position [width] of the INPUT row so that it serves as its own
; right-hand neighbour. This writes one byte past the nominal row width --
; NOTE(review): the input row buffer must have room for it.
DRCe5eaf372014-05-09 18:00:32 +000094 test eax, SIZEOF_XMMWORD-1
95 jz short .skip
96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000098.skip:
; xmm7 = sample 0 of the row in byte 0, all other bytes zero: the left
; neighbour used for the very first sample is the sample itself.
DRCe5eaf372014-05-09 18:00:32 +000099 pxor xmm0,xmm0 ; xmm0=(all 0's)
100 pcmpeqb xmm7,xmm7
101 psrldq xmm7,(SIZEOF_XMMWORD-1)
102 pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000103
; Round colctr up to a whole number of xmmword blocks; if more than one block
; remains take the general path, otherwise fall into the last-block path.
DRCe5eaf372014-05-09 18:00:32 +0000104 add eax, byte SIZEOF_XMMWORD-1
105 and eax, byte -SIZEOF_XMMWORD
106 cmp eax, byte SIZEOF_XMMWORD
107 ja short .columnloop
108 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000109
110.columnloop_last:
; Last block: xmm6 = sample 15 of this block kept in byte 15, i.e. the right
; neighbour of the final sample is the final sample itself.
DRCe5eaf372014-05-09 18:00:32 +0000111 pcmpeqb xmm6,xmm6
112 pslldq xmm6,(SIZEOF_XMMWORD-1)
113 pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
114 jmp short .upsample
115 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000116
117.columnloop:
; General block: xmm6 = sample 0 of the NEXT block moved into byte 15.
DRCe5eaf372014-05-09 18:00:32 +0000118 movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
119 pslldq xmm6,(SIZEOF_XMMWORD-1)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000120
121.upsample:
; Build the "left neighbour" (xmm2) and "right neighbour" (xmm3) vectors by
; shifting the current block one byte each way and patching the vacated byte
; with xmm7 / xmm6.
DRCe5eaf372014-05-09 18:00:32 +0000122 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
123 movdqa xmm2,xmm1
124 movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
125 pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
126 psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000127
DRCe5eaf372014-05-09 18:00:32 +0000128 por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
129 por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000130
; Remember this block's last sample as the next block's left neighbour.
DRCe5eaf372014-05-09 18:00:32 +0000131 movdqa xmm7,xmm1
132 psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000133
; Widen all three vectors from bytes to 16-bit words (low and high halves).
DRCe5eaf372014-05-09 18:00:32 +0000134 movdqa xmm4,xmm1
135 punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
136 punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
137 movdqa xmm5,xmm2
138 punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
139 punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
140 movdqa xmm6,xmm3
141 punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
142 punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000143
; centre*3, left+1, right+2: the weights and rounding biases of the filter.
DRCe5eaf372014-05-09 18:00:32 +0000144 pmullw xmm1,[GOTOFF(ebx,PW_THREE)]
145 pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
146 paddw xmm2,[GOTOFF(ebx,PW_ONE)]
147 paddw xmm5,[GOTOFF(ebx,PW_ONE)]
148 paddw xmm3,[GOTOFF(ebx,PW_TWO)]
149 paddw xmm6,[GOTOFF(ebx,PW_TWO)]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000150
DRCe5eaf372014-05-09 18:00:32 +0000151 paddw xmm2,xmm1
152 paddw xmm5,xmm4
153 psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
154 psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
155 paddw xmm3,xmm1
156 paddw xmm6,xmm4
157 psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
158 psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000159
; Interleave: odd results go to the high byte of each word, even results stay
; in the low byte, giving 32 output bytes in sample order.
DRCe5eaf372014-05-09 18:00:32 +0000160 psllw xmm3,BYTE_BIT
161 psllw xmm6,BYTE_BIT
162 por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
163 por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000164
DRCe5eaf372014-05-09 18:00:32 +0000165 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
166 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000167
; Advance one input block / two output blocks. More than one block left ->
; general path; exactly one left -> last-block path; none -> row finished.
DRCe5eaf372014-05-09 18:00:32 +0000168 sub eax, byte SIZEOF_XMMWORD
169 add esi, byte 1*SIZEOF_XMMWORD ; inptr
170 add edi, byte 2*SIZEOF_XMMWORD ; outptr
171 cmp eax, byte SIZEOF_XMMWORD
172 ja near .columnloop
173 test eax,eax
174 jnz near .columnloop_last
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000175
DRCe5eaf372014-05-09 18:00:32 +0000176 pop esi
177 pop edi
178 pop eax
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000179
; One input row produces one output row.
DRCe5eaf372014-05-09 18:00:32 +0000180 add esi, byte SIZEOF_JSAMPROW ; input_data
181 add edi, byte SIZEOF_JSAMPROW ; output_data
182 dec ecx ; rowctr
183 jg near .rowloop
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000184
185.return:
DRCe5eaf372014-05-09 18:00:32 +0000186 pop edi
187 pop esi
188; pop edx ; need not be preserved
189; pop ecx ; need not be preserved
190 poppic ebx
191 pop ebp
192 ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000193
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000193
194; --------------------------------------------------------------------------
195;
196; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
197; Again a triangle filter; see comments for h2v1 case, above.
198;
199; GLOBAL(void)
Pierre Ossmaneea72152009-03-09 13:34:17 +0000200; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
201; JDIMENSION downsampled_width,
202; JSAMPARRAY input_data,
203; JSAMPARRAY * output_data_ptr);
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000204;
205
; Argument offsets. In this routine they are applied to the ORIGINAL frame
; address (kept in eax/edx), not to ebp, because ebp is re-aligned below.
DRCe5eaf372014-05-09 18:00:32 +0000206%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
207%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
208%define input_data(b) (b)+16 ; JSAMPARRAY input_data
209%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000210
; Local frame, addressed from the 16-byte-aligned ebp:
;   [ebp+0]        saved original frame address
;   wk(0)..wk(3)   four xmmword scratch slots directly below ebp
;   gotptr         one pointer-sized slot directly below wk(0)
DRCe5eaf372014-05-09 18:00:32 +0000211%define original_ebp ebp+0
212%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
213%define WK_NUM 4
214%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000215
DRCe5eaf372014-05-09 18:00:32 +0000216 align 16
217 global EXTN(jsimd_h2v2_fancy_upsample_sse2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000218
; Two-pass triangle filter producing two output rows per input row.
;   Vertical pass (16-bit words):
;     Int0[i] = 3*row[0][i] + row[-1][i]      (for the upper output row)
;     Int1[i] = 3*row[0][i] + row[+1][i]      (for the lower output row)
;   Horizontal pass, applied to each Int row:
;     out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;     out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;   At the row edges the missing horizontal neighbour is the edge value itself.
; The 16-bit intermediates are parked in the output rows themselves (16 words
; occupy the same 32 bytes as the 32 final samples) and overwritten in place.
; Register roles inside a row:
;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;   edx = outptr0, edi = outptr1
; ebx doubles as the GOTOFF() base: it is saved with pushpic, reloaded from
; gotptr, and restored with poppic around every constant access.
; wk(0)/wk(1) hold the left-neighbour word for the upper/lower row;
; wk(2)/wk(3) hold the right-neighbour word for the upper/lower row.
; The row-pointer slots input_data[-1] and input_data[+1] are dereferenced, so
; the caller must supply valid neighbouring rows -- NOTE(review): confirm.
; All vector accesses use movdqa, so row pointers are expected to be 16-byte
; aligned -- NOTE(review): confirm against the buffer allocator.
Pierre Ossmaneea72152009-03-09 13:34:17 +0000219EXTN(jsimd_h2v2_fancy_upsample_sse2):
DRCe5eaf372014-05-09 18:00:32 +0000220 push ebp
221 mov eax,esp ; eax = original ebp
222 sub esp, byte 4
223 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
224 mov [esp],eax
225 mov ebp,esp ; ebp = aligned ebp
226 lea esp, [wk(0)]
227 pushpic eax ; make a room for GOT address
228 push ebx
229; push ecx ; need not be preserved
230; push edx ; need not be preserved
231 push esi
232 push edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000233
DRCe5eaf372014-05-09 18:00:32 +0000234 get_GOT ebx ; get GOT address
235 movpic POINTER [gotptr], ebx ; save GOT address
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000236
DRCe5eaf372014-05-09 18:00:32 +0000237 mov edx,eax ; edx = original ebp
238 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
239 test eax,eax
240 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000241
DRCe5eaf372014-05-09 18:00:32 +0000242 mov ecx, INT [max_v_samp(edx)] ; rowctr
243 test ecx,ecx
244 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000245
DRCe5eaf372014-05-09 18:00:32 +0000246 mov esi, JSAMPARRAY [input_data(edx)] ; input_data
247 mov edi, POINTER [output_data_ptr(edx)]
248 mov edi, JSAMPARRAY [edi] ; output_data
249 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000250.rowloop:
; Save per-row state; all four registers are repurposed inside the row and
; restored in reverse order once the row is done.
DRCe5eaf372014-05-09 18:00:32 +0000251 push eax ; colctr
252 push ecx
253 push edi
254 push esi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000255
DRCe5eaf372014-05-09 18:00:32 +0000256 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
257 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
258 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
259 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
260 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000261
; If the width is not a multiple of SIZEOF_XMMWORD, duplicate the last real
; sample of each of the three input rows into position [width]. edx is
; saved around this because dl is used as the byte scratch register.
DRCe5eaf372014-05-09 18:00:32 +0000262 test eax, SIZEOF_XMMWORD-1
263 jz short .skip
264 push edx
265 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
266 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
267 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
268 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
269 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
270 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
271 pop edx
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000272.skip:
DRCe5eaf372014-05-09 18:00:32 +0000273 ; -- process the first column block
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000274
DRCe5eaf372014-05-09 18:00:32 +0000275 movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
276 movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
277 movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000278
; The input has been loaded, so ebx (inptr0) can be lent out as GOT base.
DRCe5eaf372014-05-09 18:00:32 +0000279 pushpic ebx
280 movpic ebx, POINTER [gotptr] ; load GOT address
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000281
DRCe5eaf372014-05-09 18:00:32 +0000282 pxor xmm3,xmm3 ; xmm3=(all 0's)
283 movdqa xmm4,xmm0
284 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
285 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
286 movdqa xmm5,xmm1
287 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
288 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
289 movdqa xmm6,xmm2
290 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
291 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000292
DRCe5eaf372014-05-09 18:00:32 +0000293 pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
294 pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000295
; xmm7 = mask selecting only the lowest 16-bit word.
DRCe5eaf372014-05-09 18:00:32 +0000296 pcmpeqb xmm7,xmm7
297 psrldq xmm7,(SIZEOF_XMMWORD-2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000298
DRCe5eaf372014-05-09 18:00:32 +0000299 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
300 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
301 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
302 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000303
DRCe5eaf372014-05-09 18:00:32 +0000304 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
305 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
306 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
307 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000308
; Left edge: the left neighbour of intermediate 0 is intermediate 0 itself.
DRCe5eaf372014-05-09 18:00:32 +0000309 pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
310 pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000311
DRCe5eaf372014-05-09 18:00:32 +0000312 movdqa XMMWORD [wk(0)], xmm1
313 movdqa XMMWORD [wk(1)], xmm2
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000314
DRCe5eaf372014-05-09 18:00:32 +0000315 poppic ebx
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000316
; Round colctr up to whole blocks and pick the general or last-block path.
DRCe5eaf372014-05-09 18:00:32 +0000317 add eax, byte SIZEOF_XMMWORD-1
318 and eax, byte -SIZEOF_XMMWORD
319 cmp eax, byte SIZEOF_XMMWORD
320 ja short .columnloop
321 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000322
323.columnloop_last:
DRCe5eaf372014-05-09 18:00:32 +0000324 ; -- process the last column block
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000325
DRCe5eaf372014-05-09 18:00:32 +0000326 pushpic ebx
327 movpic ebx, POINTER [gotptr] ; load GOT address
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000328
; Right edge: keep only the highest word of the current block's high-half
; intermediates, so intermediate 15 becomes its own right neighbour.
DRCe5eaf372014-05-09 18:00:32 +0000329 pcmpeqb xmm1,xmm1
330 pslldq xmm1,(SIZEOF_XMMWORD-2)
331 movdqa xmm2,xmm1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000332
DRCe5eaf372014-05-09 18:00:32 +0000333 pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
334 pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000335
DRCe5eaf372014-05-09 18:00:32 +0000336 movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
337 movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000338
DRCe5eaf372014-05-09 18:00:32 +0000339 jmp near .upsample
340 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000341
342.columnloop:
DRCe5eaf372014-05-09 18:00:32 +0000343 ; -- process the next column block
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000344
; Run the vertical pass one block AHEAD of the block about to be filtered
; horizontally, so that its first intermediate is available as the current
; block's right neighbour.
DRCe5eaf372014-05-09 18:00:32 +0000345 movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
346 movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
347 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000348
DRCe5eaf372014-05-09 18:00:32 +0000349 pushpic ebx
350 movpic ebx, POINTER [gotptr] ; load GOT address
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000351
DRCe5eaf372014-05-09 18:00:32 +0000352 pxor xmm3,xmm3 ; xmm3=(all 0's)
353 movdqa xmm4,xmm0
354 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
355 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
356 movdqa xmm5,xmm1
357 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
358 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
359 movdqa xmm6,xmm2
360 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
361 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000362
DRCe5eaf372014-05-09 18:00:32 +0000363 pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
364 pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000365
DRCe5eaf372014-05-09 18:00:32 +0000366 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
367 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
368 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
369 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000370
; Park the next block's intermediates in the output slots it will later fill.
DRCe5eaf372014-05-09 18:00:32 +0000371 movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
372 movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
373 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
374 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000375
; Move the next block's first intermediate into the highest word position:
; it is the right neighbour of the current block's intermediate 15.
DRCe5eaf372014-05-09 18:00:32 +0000376 pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
377 pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000378
DRCe5eaf372014-05-09 18:00:32 +0000379 movdqa XMMWORD [wk(2)], xmm1
380 movdqa XMMWORD [wk(3)], xmm2
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000381
382.upsample:
; Horizontal pass. On entry ebx holds the GOT base (pushpic'd by the path
; that jumped/fell in here); it is restored by the poppic after both rows.
DRCe5eaf372014-05-09 18:00:32 +0000383 ; -- process the upper row
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000384
DRCe5eaf372014-05-09 18:00:32 +0000385 movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
386 movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000387
DRCe5eaf372014-05-09 18:00:32 +0000388 movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
389 movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
390 psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
391 pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
392 movdqa xmm5,xmm7
393 movdqa xmm6,xmm3
394 psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
395 pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000396
DRCe5eaf372014-05-09 18:00:32 +0000397 por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
398 por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000399
DRCe5eaf372014-05-09 18:00:32 +0000400 movdqa xmm1,xmm7
401 movdqa xmm2,xmm3
402 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
403 psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
404 movdqa xmm4,xmm3
405 psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000406
DRCe5eaf372014-05-09 18:00:32 +0000407 por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
408 por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000409
; This block's last intermediate becomes the next block's left neighbour.
DRCe5eaf372014-05-09 18:00:32 +0000410 movdqa XMMWORD [wk(0)], xmm4
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000411
DRCe5eaf372014-05-09 18:00:32 +0000412 pmullw xmm7,[GOTOFF(ebx,PW_THREE)]
413 pmullw xmm3,[GOTOFF(ebx,PW_THREE)]
414 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
415 paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
416 paddw xmm0,[GOTOFF(ebx,PW_SEVEN)]
417 paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000418
DRCe5eaf372014-05-09 18:00:32 +0000419 paddw xmm1,xmm7
420 paddw xmm5,xmm3
421 psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
422 psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
423 paddw xmm0,xmm7
424 paddw xmm2,xmm3
425 psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
426 psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000427
; Interleave even (low byte) and odd (high byte) results into sample order.
DRCe5eaf372014-05-09 18:00:32 +0000428 psllw xmm0,BYTE_BIT
429 psllw xmm2,BYTE_BIT
430 por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
431 por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000432
; Final samples overwrite the intermediates that were parked here.
DRCe5eaf372014-05-09 18:00:32 +0000433 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
434 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000435
DRCe5eaf372014-05-09 18:00:32 +0000436 ; -- process the lower row
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000437
DRCe5eaf372014-05-09 18:00:32 +0000438 movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
439 movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000440
DRCe5eaf372014-05-09 18:00:32 +0000441 movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
442 movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
443 psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
444 pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
445 movdqa xmm0,xmm6
446 movdqa xmm2,xmm4
447 psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
448 pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000449
DRCe5eaf372014-05-09 18:00:32 +0000450 por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
451 por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000452
DRCe5eaf372014-05-09 18:00:32 +0000453 movdqa xmm1,xmm6
454 movdqa xmm5,xmm4
455 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
456 psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
457 movdqa xmm3,xmm4
458 psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000459
DRCe5eaf372014-05-09 18:00:32 +0000460 por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
461 por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000462
DRCe5eaf372014-05-09 18:00:32 +0000463 movdqa XMMWORD [wk(1)], xmm3
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000464
DRCe5eaf372014-05-09 18:00:32 +0000465 pmullw xmm6,[GOTOFF(ebx,PW_THREE)]
466 pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
467 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
468 paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
469 paddw xmm7,[GOTOFF(ebx,PW_SEVEN)]
470 paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000471
DRCe5eaf372014-05-09 18:00:32 +0000472 paddw xmm1,xmm6
473 paddw xmm0,xmm4
474 psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
475 psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
476 paddw xmm7,xmm6
477 paddw xmm5,xmm4
478 psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
479 psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000480
DRCe5eaf372014-05-09 18:00:32 +0000481 psllw xmm7,BYTE_BIT
482 psllw xmm5,BYTE_BIT
483 por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
484 por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000485
DRCe5eaf372014-05-09 18:00:32 +0000486 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
487 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000488
; Give ebx back to its inptr0 role before the pointers are advanced.
DRCe5eaf372014-05-09 18:00:32 +0000489 poppic ebx
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000490
; Advance one input block / two output blocks. More than one block left ->
; general path; exactly one left -> last-block path; none -> row finished.
DRCe5eaf372014-05-09 18:00:32 +0000491 sub eax, byte SIZEOF_XMMWORD
492 add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
493 add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
494 add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
495 add edx, byte 2*SIZEOF_XMMWORD ; outptr0
496 add edi, byte 2*SIZEOF_XMMWORD ; outptr1
497 cmp eax, byte SIZEOF_XMMWORD
498 ja near .columnloop
499 test eax,eax
500 jnz near .columnloop_last
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000501
DRCe5eaf372014-05-09 18:00:32 +0000502 pop esi
503 pop edi
504 pop ecx
505 pop eax
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000506
; One input row yields two output rows, so rowctr drops by 2 per iteration.
DRCe5eaf372014-05-09 18:00:32 +0000507 add esi, byte 1*SIZEOF_JSAMPROW ; input_data
508 add edi, byte 2*SIZEOF_JSAMPROW ; output_data
509 sub ecx, byte 2 ; rowctr
510 jg near .rowloop
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000511
512.return:
; Restore callee-saved registers, then unwind the aligned frame: esp is reset
; to the aligned ebp, whose slot [ebp+0] holds the original frame address.
DRCe5eaf372014-05-09 18:00:32 +0000513 pop edi
514 pop esi
515; pop edx ; need not be preserved
516; pop ecx ; need not be preserved
517 pop ebx
518 mov esp,ebp ; esp <- aligned ebp
519 pop esp ; esp <- original ebp
520 pop ebp
521 ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000522
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000522
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000523; --------------------------------------------------------------------------
524;
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000525; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
526; It's still a box filter.
527;
528; GLOBAL(void)
Pierre Ossmaneea72152009-03-09 13:34:17 +0000529; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
530; JDIMENSION output_width,
531; JSAMPARRAY input_data,
532; JSAMPARRAY * output_data_ptr);
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000533;
534
; Argument offsets relative to the frame pointer set up in the prologue.
; Note that the width argument here is the OUTPUT width.
DRCe5eaf372014-05-09 18:00:32 +0000535%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
536%define output_width(b) (b)+12 ; JDIMENSION output_width
537%define input_data(b) (b)+16 ; JSAMPARRAY input_data
538%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000539
DRCe5eaf372014-05-09 18:00:32 +0000540 align 16
541 global EXTN(jsimd_h2v1_upsample_sse2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000542
; Pixel replication: every input sample is written twice to the output row.
; Register roles:
;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output columns left in the current row)
;   ecx = rowctr
;   esi = input_data / inptr, edi = output_data / outptr
; Because of the rounding, whole 32-byte output groups are always written;
; the output rows must have room for the padded width.
; All vector accesses use movdqa, so row pointers are expected to be 16-byte
; aligned -- NOTE(review): confirm against the buffer allocator.
; Returns immediately if the rounded width or max_v_samp_factor is 0.
Pierre Ossmaneea72152009-03-09 13:34:17 +0000543EXTN(jsimd_h2v1_upsample_sse2):
DRCe5eaf372014-05-09 18:00:32 +0000544 push ebp
545 mov ebp,esp
546; push ebx ; unused
547; push ecx ; need not be preserved
548; push edx ; need not be preserved
549 push esi
550 push edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000551
DRCe5eaf372014-05-09 18:00:32 +0000552 mov edx, JDIMENSION [output_width(ebp)]
553 add edx, byte (2*SIZEOF_XMMWORD)-1
554 and edx, byte -(2*SIZEOF_XMMWORD)
555 jz short .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000556
DRCe5eaf372014-05-09 18:00:32 +0000557 mov ecx, INT [max_v_samp(ebp)] ; rowctr
558 test ecx,ecx
559 jz short .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000560
DRCe5eaf372014-05-09 18:00:32 +0000561 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
562 mov edi, POINTER [output_data_ptr(ebp)]
563 mov edi, JSAMPARRAY [edi] ; output_data
564 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000565.rowloop:
DRCe5eaf372014-05-09 18:00:32 +0000566 push edi
567 push esi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000568
DRCe5eaf372014-05-09 18:00:32 +0000569 mov esi, JSAMPROW [esi] ; inptr
570 mov edi, JSAMPROW [edi] ; outptr
571 mov eax,edx ; colctr
572 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000573.columnloop:
574
; The loop body is unrolled twice. Each half loads 16 input samples,
; unpacks the vector with itself so every byte appears twice in a row, and
; stores the resulting 32 output bytes. The pointers are only advanced
; after the second half; either half can end the row.
DRCe5eaf372014-05-09 18:00:32 +0000575 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000576
DRCe5eaf372014-05-09 18:00:32 +0000577 movdqa xmm1,xmm0
578 punpcklbw xmm0,xmm0
579 punpckhbw xmm1,xmm1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000580
DRCe5eaf372014-05-09 18:00:32 +0000581 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
582 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000583
DRCe5eaf372014-05-09 18:00:32 +0000584 sub eax, byte 2*SIZEOF_XMMWORD
585 jz short .nextrow
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000586
DRCe5eaf372014-05-09 18:00:32 +0000587 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000588
DRCe5eaf372014-05-09 18:00:32 +0000589 movdqa xmm3,xmm2
590 punpcklbw xmm2,xmm2
591 punpckhbw xmm3,xmm3
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000592
DRCe5eaf372014-05-09 18:00:32 +0000593 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
594 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000595
DRCe5eaf372014-05-09 18:00:32 +0000596 sub eax, byte 2*SIZEOF_XMMWORD
597 jz short .nextrow
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000598
DRCe5eaf372014-05-09 18:00:32 +0000599 add esi, byte 2*SIZEOF_XMMWORD ; inptr
600 add edi, byte 4*SIZEOF_XMMWORD ; outptr
601 jmp short .columnloop
602 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000603
604.nextrow:
DRCe5eaf372014-05-09 18:00:32 +0000605 pop esi
606 pop edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000607
; One input row produces one output row.
DRCe5eaf372014-05-09 18:00:32 +0000608 add esi, byte SIZEOF_JSAMPROW ; input_data
609 add edi, byte SIZEOF_JSAMPROW ; output_data
610 dec ecx ; rowctr
611 jg short .rowloop
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000612
613.return:
DRCe5eaf372014-05-09 18:00:32 +0000614 pop edi
615 pop esi
616; pop edx ; need not be preserved
617; pop ecx ; need not be preserved
618; pop ebx ; unused
619 pop ebp
620 ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000621
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000621
622; --------------------------------------------------------------------------
623;
624; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
625; It's still a box filter.
626;
627; GLOBAL(void)
Pierre Ossmaneea72152009-03-09 13:34:17 +0000628; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
629; JDIMENSION output_width,
630; JSAMPARRAY input_data,
631; JSAMPARRAY * output_data_ptr);
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000632;
633
; Argument offsets relative to the frame pointer set up in the prologue.
; Note that the width argument here is the OUTPUT width.
DRCe5eaf372014-05-09 18:00:32 +0000634%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
635%define output_width(b) (b)+12 ; JDIMENSION output_width
636%define input_data(b) (b)+16 ; JSAMPARRAY input_data
637%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000638
DRCe5eaf372014-05-09 18:00:32 +0000639 align 16
640 global EXTN(jsimd_h2v2_upsample_sse2)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000641
; Pixel replication in both directions: every input sample is written twice
; horizontally, and the same doubled data is stored to two output rows.
; Register roles:
;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output columns left in the current row pair)
;   ecx = rowctr (counts output rows, decremented by 2 per input row)
;   esi = input_data / inptr
;   ebx = outptr0, edi = output_data / outptr1
; ebx is saved and restored here because it is used as outptr0.
; Because of the rounding, whole 32-byte output groups are always written;
; the output rows must have room for the padded width.
; All vector accesses use movdqa, so row pointers are expected to be 16-byte
; aligned -- NOTE(review): confirm against the buffer allocator.
; Returns immediately if the rounded width or max_v_samp_factor is 0.
Pierre Ossmaneea72152009-03-09 13:34:17 +0000642EXTN(jsimd_h2v2_upsample_sse2):
DRCe5eaf372014-05-09 18:00:32 +0000643 push ebp
644 mov ebp,esp
645 push ebx
646; push ecx ; need not be preserved
647; push edx ; need not be preserved
648 push esi
649 push edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000650
DRCe5eaf372014-05-09 18:00:32 +0000651 mov edx, JDIMENSION [output_width(ebp)]
652 add edx, byte (2*SIZEOF_XMMWORD)-1
653 and edx, byte -(2*SIZEOF_XMMWORD)
654 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000655
DRCe5eaf372014-05-09 18:00:32 +0000656 mov ecx, INT [max_v_samp(ebp)] ; rowctr
657 test ecx,ecx
658 jz near .return
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000659
DRCe5eaf372014-05-09 18:00:32 +0000660 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
661 mov edi, POINTER [output_data_ptr(ebp)]
662 mov edi, JSAMPARRAY [edi] ; output_data
663 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000664.rowloop:
DRCe5eaf372014-05-09 18:00:32 +0000665 push edi
666 push esi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000667
DRCe5eaf372014-05-09 18:00:32 +0000668 mov esi, JSAMPROW [esi] ; inptr
669 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
670 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
671 mov eax,edx ; colctr
672 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000673.columnloop:
674
; The loop body is unrolled twice. Each half loads 16 input samples,
; unpacks the vector with itself so every byte appears twice in a row, and
; stores the same 32 bytes to both output rows. The pointers are only
; advanced after the second half; either half can end the row.
DRCe5eaf372014-05-09 18:00:32 +0000675 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000676
DRCe5eaf372014-05-09 18:00:32 +0000677 movdqa xmm1,xmm0
678 punpcklbw xmm0,xmm0
679 punpckhbw xmm1,xmm1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000680
DRCe5eaf372014-05-09 18:00:32 +0000681 movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
682 movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
683 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
684 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000685
DRCe5eaf372014-05-09 18:00:32 +0000686 sub eax, byte 2*SIZEOF_XMMWORD
687 jz short .nextrow
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000688
DRCe5eaf372014-05-09 18:00:32 +0000689 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000690
DRCe5eaf372014-05-09 18:00:32 +0000691 movdqa xmm3,xmm2
692 punpcklbw xmm2,xmm2
693 punpckhbw xmm3,xmm3
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000694
DRCe5eaf372014-05-09 18:00:32 +0000695 movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
696 movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
697 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
698 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000699
DRCe5eaf372014-05-09 18:00:32 +0000700 sub eax, byte 2*SIZEOF_XMMWORD
701 jz short .nextrow
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000702
DRCe5eaf372014-05-09 18:00:32 +0000703 add esi, byte 2*SIZEOF_XMMWORD ; inptr
704 add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
705 add edi, byte 4*SIZEOF_XMMWORD ; outptr1
706 jmp short .columnloop
707 alignx 16,7
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000708
709.nextrow:
DRCe5eaf372014-05-09 18:00:32 +0000710 pop esi
711 pop edi
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000712
; One input row yields two output rows, so rowctr drops by 2 per iteration.
DRCe5eaf372014-05-09 18:00:32 +0000713 add esi, byte 1*SIZEOF_JSAMPROW ; input_data
714 add edi, byte 2*SIZEOF_JSAMPROW ; output_data
715 sub ecx, byte 2 ; rowctr
716 jg short .rowloop
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000717
718.return:
DRCe5eaf372014-05-09 18:00:32 +0000719 pop edi
720 pop esi
721; pop edx ; need not be preserved
722; pop ecx ; need not be preserved
723 pop ebx
724 pop ebp
725 ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000726
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000726
DRC132b5fd2009-10-08 09:04:56 +0000727; For some reason, the OS X linker does not honor the request to align the
728; segment unless we do this.
DRCe5eaf372014-05-09 18:00:32 +0000729 align 16