; (source blob: 0a509408aa1314f5434a60aff46fd61d0a1efa08)
;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Everything below is emitted into the code segment as 32-bit code.
; SEG_TEXT is not defined in this file (presumably provided by jsimdext.inc).
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Arguments are read from the stack relative to ebp (see %defines below).
;
; Register roles:
;   esi  = cursor into the sample_data row-pointer array
;   eax  = start_col (column offset applied to every row)
;   edi  = cursor into workspace
;   ecx  = loop counter (DCTSIZE/4 iterations, 4 rows per iteration)
;   ebx, edx = scratch row pointers
;   xmm6 = all zeros (used to zero-extend bytes to words)
;   xmm7 = 0xFF80 in every word (i.e. -128 as a signed 16-bit value)
;
; Preserved: ebx, esi, edi, ebp.
; Clobbered: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags.
;
; The workspace stores use movdqa, which faults on addresses that are not
; 16-byte aligned, so workspace is expected to be suitably aligned.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Build the two constants used by every iteration.
    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)
    mov         ecx, DCTSIZE/4                 ; 4 rows are handled per pass
    alignx      16, 7                   ; macro defined elsewhere; presumably
                                        ; aligns the loop head
.convloop:
    ; Fetch 8 bytes from each of four consecutive rows, starting at start_col.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    ; Zero-extend each byte to a word, then add 0xFF80 (subtract 128) so the
    ; unsigned samples become signed values.
    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; Write the four converted rows to the workspace.
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Advance both cursors by four rows.
    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000095
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Arguments are read from the stack relative to ebp (see %defines below).
;
; For every 16-bit element the loop computes, in order:
;   1. sign mask s = x >> (WORD_BIT-1) (arithmetic shift) and |x| = (x^s)-s
;   2. |x| + correction
;   3. high word of the unsigned product with the reciprocal (pmulhuw)
;   4. high word of the unsigned product with the scale (pmulhuw)
;   5. sign restored with (v^s)-s
; Each pass handles four XMM words (32 elements); the loop runs DCTSIZE2/32
; times.
;
; Register roles:
;   esi  = cursor into workspace (input)
;   edx  = cursor into divisors (reciprocal / correction / scale tables)
;   edi  = cursor into coef_block (output)
;   eax  = loop counter
;   xmm0-xmm3 = values being quantized, xmm4-xmm7 = their sign masks
;
; Preserved: esi, edi, ebp (ebx and ecx are not touched).
; Clobbered: eax, edx, xmm0-xmm7, flags.
;
; movdqa and the SSE2 memory-operand forms of paddw/pmulhuw fault on
; addresses that are not 16-byte aligned, so workspace, divisors and
; coef_block are expected to be suitably aligned.
;

; The divisors argument is addressed as three sub-tables, selected by an
; XMMBLOCK row offset of DCTSIZE*0, DCTSIZE*1 and DCTSIZE*2 respectively.
%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/32        ; 32 elements are handled per pass
    alignx      16, 7                   ; macro defined elsewhere; presumably
                                        ; aligns the loop head
.quantloop:
    ; Load four XMM words of coefficients and keep a working copy of each.
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    ; xmm4-xmm7 become per-element sign masks (0x0000 or 0xFFFF); they stay
    ; live until the sign is restored after the multiplies.
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    ; Re-apply the original sign: (v ^ mask) - mask negates v where mask is
    ; all ones and leaves it unchanged where mask is zero.
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    ; NOTE(review): the store addresses below are formed with SIZEOF_DCTELEM
    ; while edi is advanced by SIZEOF_JCOEF; this is only consistent if the
    ; two sizes are equal -- confirm against jsimdext.inc / jdct.inc.
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Advance input, divisor and output cursors by 32 elements.
    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .quantloop         ; near form: presumably the loop body
                                        ; is too long for a short jump

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000198
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32