|  | ; | 
|  | ; jcsamss2-64.asm - downsampling (64-bit SSE2) | 
|  | ; | 
|  | ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 
|  | ; Copyright 2009 D. R. Commander | 
|  | ; | 
|  | ; Based on | 
|  | ; x86 SIMD extension for IJG JPEG library | 
|  | ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 
|  | ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 
|  | ; | 
|  | ; This file should be assembled with NASM (Netwide Assembler), | 
|  | ; can *not* be assembled with Microsoft's MASM or any compatible | 
|  | ; assembler (including Borland's Turbo Assembler). | 
|  | ; NASM is available from http://nasm.sourceforge.net/ or | 
|  | ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 
|  | ; | 
|  | ; [TAB8] | 
|  |  | 
|  | %include "jsimdext.inc" | 
|  |  | 
|  | ; -------------------------------------------------------------------------- | 
|  | SECTION	SEG_TEXT | 
|  | BITS	64 | 
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_blocks * 8.  Nothing is done when it is zero.
;   2. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the last real sample (row[image_width-1]) is replicated to the right
;      until the row holds output_cols * 2 samples.
;   3. For each of v_samp_factor rows, every horizontal pair of input samples
;      is reduced to one output sample: (left + right + bias) >> 1, where the
;      bias alternates 0, 1, 0, 1, ... along the output row.
;
; Memory access notes:
;   * All row accesses use movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   * Data is processed one XMMWORD of output at a time.  When fewer than
;     SIZEOF_XMMWORD output columns remain, one XMMWORD of input is still
;     read and one full XMMWORD of output is still written (upper half
;     zero), so the row buffers must be large enough for that.
;
; The 32-bit arguments are always read through their 32-bit sub-registers
; (which zero-extends them): the x86-64 calling conventions leave the upper
; halves of the argument registers undefined for 32-bit parameters.
;
; NOTE(review): xmm6/xmm7 are modified here.  They are callee-saved in the
; Microsoft x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc)
; preserve them on that platform -- confirm against the macro definitions.
;

; Register assignment after collect_args:
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; kept for collect_args (see jsimdext.inc)
    mov         rbp, rsp
    collect_args

    mov         ecx, r13d               ; zero-extend width_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append per row
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, r11d               ; rax = max_v_samp_factor (row count)
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must advance forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to be reused as fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first sample past the real data
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr (zero-extended v_samp_factor)
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is reloaded for each row
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: only one XMMWORD of
    ; input is loaded, the second half is treated as zero.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; left + right
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; back to 16 bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD        ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW       ; input_data
    add         rdi, byte SIZEOF_JSAMPROW       ; output_data
    dec         rax                             ; rowctr
    jg          near .rowloop

.return:
    uncollect_args
    pop         rbp
    ret
|  |  | 
|  | ; -------------------------------------------------------------------------- | 
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_blocks * 8.  Nothing is done when it is zero.
;   2. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the last real sample (row[image_width-1]) is replicated to the right
;      until the row holds output_cols * 2 samples.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every 2x2 group of input samples is reduced to one
;      output sample: (sum of the four + bias) >> 2, where the bias
;      alternates 1, 2, 1, 2, ... along the output row.
;
; Memory access notes:
;   * All row accesses use movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   * Data is processed one XMMWORD of output at a time.  When fewer than
;     SIZEOF_XMMWORD output columns remain, one XMMWORD is still read from
;     each input row and one full XMMWORD of output is still written (upper
;     half zero), so the row buffers must be large enough for that.
;
; The 32-bit arguments are always read through their 32-bit sub-registers
; (which zero-extends them): the x86-64 calling conventions leave the upper
; halves of the argument registers undefined for 32-bit parameters.
;
; NOTE(review): xmm6/xmm7 are modified here.  They are callee-saved in the
; Microsoft x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc)
; preserve them on that platform -- confirm against the macro definitions.
;

; Register assignment after collect_args:
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; kept for collect_args (see jsimdext.inc)
    mov         rbp, rsp
    collect_args

    mov         ecx, r13d               ; zero-extend width_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append per row
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, r11d               ; rax = max_v_samp_factor (row count)
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must advance forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to be reused as fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first sample past the real data
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr (zero-extended v_samp_factor)
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is reloaded for each row
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
    mov         rdi, JSAMPROW [rdi]                     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: only one XMMWORD is
    ; loaded from each input row, the second half is treated as zero.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of each row: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row0 pair + row1 pair
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; back to 16 bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD        ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
    dec         rax                             ; rowctr
    jg          near .rowloop

.return:
    uncollect_args
    pop         rbp
    ret
|  |  | 
|  | ; For some reason, the OS X linker does not honor the request to align the | 
|  | ; segment unless we do this. | 
|  | align	16 |