; simd/jcsample-sse2-64.asm - third_party/libjpeg-turbo - Git at Google

 ;
 ; jcsample.asm - downsampling (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
 ;
 ; This file should be assembled with NASM (Netwide Assembler); it
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; [TAB8]

 %include "jsimdext.inc"

 ; --------------------------------------------------------------------------
         SECTION SEG_TEXT
         BITS    64
 ;
 ; Downsample pixel values of a single component.
 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
 ; without smoothing.
 ;
 ; GLOBAL(void)
 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
 ;

 ; r10 = JDIMENSION image_width
 ; r11 = int max_v_samp_factor
 ; r12 = JDIMENSION v_samp_factor
 ; r13 = JDIMENSION width_blocks
 ; r14 = JSAMPARRAY input_data
 ; r15 = JSAMPARRAY output_data

         align   16
         global  EXTN(jsimd_h2v1_downsample_sse2)

 EXTN(jsimd_h2v1_downsample_sse2):
         ; 2:1 horizontal, 1:1 vertical downsampling of one component.
         ; Phase 1 pads every input row on the right up to output_cols*2 bytes,
         ; phase 2 averages each horizontal pair of samples into one output
         ; sample, 16 output samples per iteration.
         ;
         ; rax is set to rsp before collect_args; presumably that macro (defined
         ; in jsimdext.inc, not visible here) uses it to locate stack arguments.
         push    rbp
         mov     rax, rsp
         mov     rbp, rsp
         collect_args

         mov     ecx, r13d               ; width_blocks, zero-extended to 64 bits
         shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
         jz      near .return            ; no output columns -> nothing to do

         mov     edx, r10d               ; rdx = image_width, zero-extended

         ; -- expand_right_edge

         push    rcx                     ; keep output_cols for the second phase
         shl     rcx, 1                  ; output_cols * 2
         sub     rcx, rdx                ; rcx = bytes of padding needed per row
         jle     short .expand_end       ; rows are already wide enough

         ; max_v_samp_factor is a 32-bit int.  Bits 63:32 of the register it
         ; arrives in are not defined by the calling convention, so sign-extend
         ; the low half rather than reading all 64 bits of r11.
         movsxd  rax, r11d               ; rax = number of rows to pad
         test    rax, rax
         jle     short .expand_end

         cld                             ; rep stosb must walk upwards
         mov     rsi, r14                ; input_data
 .expandloop:
         push    rax                     ; al is about to be overwritten
         push    rcx                     ; rep stosb consumes rcx

         mov     rdi, JSAMPROW [rsi]
         add     rdi, rdx                ; rdi -> first byte past the real samples
         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

         rep stosb                       ; replicate it rcx times

         pop     rcx
         pop     rax

         add     rsi, byte SIZEOF_JSAMPROW
         dec     rax
         jg      short .expandloop

 .expand_end:
         pop     rcx                     ; output_cols

         ; -- h2v1_downsample

         mov     eax, r12d               ; rowctr
         test    eax, eax
         jle     near .return

         ; Only the low 32 bits are consumed by movd, so a 32-bit move suffices.
         mov     edx, 0x00010000         ; bias pattern
         movd    xmm7, edx
         pcmpeqw xmm6, xmm6
         pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
         psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

         mov     rsi, r14                ; input_data
         mov     rdi, r15                ; output_data
 .rowloop:
         push    rcx                     ; output_cols, restored for the next row
         push    rdi
         push    rsi

         mov     rsi, JSAMPROW [rsi]     ; inptr
         mov     rdi, JSAMPROW [rdi]     ; outptr

         cmp     rcx, byte SIZEOF_XMMWORD
         jae     short .columnloop

 .columnloop_r8:
         ; Fewer than SIZEOF_XMMWORD output columns remain: load one input
         ; vector, zero the second, and force rcx so the loop ends after the
         ; store below (which still writes a full XMMWORD).
         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
         pxor    xmm1, xmm1
         mov     rcx, SIZEOF_XMMWORD
         jmp     short .downsample

 .columnloop:
         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
         movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

 .downsample:
         movdqa  xmm2, xmm0
         movdqa  xmm3, xmm1

         pand    xmm0, xmm6              ; even-numbered samples as words
         psrlw   xmm2, BYTE_BIT          ; odd-numbered samples as words
         pand    xmm1, xmm6
         psrlw   xmm3, BYTE_BIT

         paddw   xmm0, xmm2              ; pair sums
         paddw   xmm1, xmm3
         paddw   xmm0, xmm7              ; + alternating 0/1 bias
         paddw   xmm1, xmm7
         psrlw   xmm0, 1                 ; / 2
         psrlw   xmm1, 1

         packuswb xmm0, xmm1             ; 16 words -> 16 output bytes

         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
         cmp     rcx, byte SIZEOF_XMMWORD
         jae     short .columnloop
         test    rcx, rcx
         jnz     short .columnloop_r8

         pop     rsi
         pop     rdi
         pop     rcx

         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
         dec     rax                             ; rowctr
         jg      near .rowloop

 .return:
         uncollect_args
         pop     rbp
         ret

 ; --------------------------------------------------------------------------
 ;
 ; Downsample pixel values of a single component.
 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
 ; without smoothing.
 ;
 ; GLOBAL(void)
 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
 ;

 ; r10 = JDIMENSION image_width
 ; r11 = int max_v_samp_factor
 ; r12 = JDIMENSION v_samp_factor
 ; r13 = JDIMENSION width_blocks
 ; r14 = JSAMPARRAY input_data
 ; r15 = JSAMPARRAY output_data

         align   16
         global  EXTN(jsimd_h2v2_downsample_sse2)

 EXTN(jsimd_h2v2_downsample_sse2):
         ; 2:1 horizontal, 2:1 vertical downsampling of one component.
         ; Phase 1 pads every input row on the right up to output_cols*2 bytes,
         ; phase 2 averages each 2x2 group of samples (two adjacent input rows)
         ; into one output sample, 16 output samples per iteration.
         ;
         ; rax is set to rsp before collect_args; presumably that macro (defined
         ; in jsimdext.inc, not visible here) uses it to locate stack arguments.
         push    rbp
         mov     rax, rsp
         mov     rbp, rsp
         collect_args

         mov     ecx, r13d               ; width_blocks, zero-extended to 64 bits
         shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
         jz      near .return            ; no output columns -> nothing to do

         mov     edx, r10d               ; rdx = image_width, zero-extended

         ; -- expand_right_edge

         push    rcx                     ; keep output_cols for the second phase
         shl     rcx, 1                  ; output_cols * 2
         sub     rcx, rdx                ; rcx = bytes of padding needed per row
         jle     short .expand_end       ; rows are already wide enough

         ; max_v_samp_factor is a 32-bit int.  Bits 63:32 of the register it
         ; arrives in are not defined by the calling convention, so sign-extend
         ; the low half rather than reading all 64 bits of r11.
         movsxd  rax, r11d               ; rax = number of rows to pad
         test    rax, rax
         jle     short .expand_end

         cld                             ; rep stosb must walk upwards
         mov     rsi, r14                ; input_data
 .expandloop:
         push    rax                     ; al is about to be overwritten
         push    rcx                     ; rep stosb consumes rcx

         mov     rdi, JSAMPROW [rsi]
         add     rdi, rdx                ; rdi -> first byte past the real samples
         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

         rep stosb                       ; replicate it rcx times

         pop     rcx
         pop     rax

         add     rsi, byte SIZEOF_JSAMPROW
         dec     rax
         jg      short .expandloop

 .expand_end:
         pop     rcx                     ; output_cols

         ; -- h2v2_downsample

         mov     eax, r12d               ; rowctr (upper half of rax cleared)
         test    rax, rax                ; rax is zero-extended, so the jle
         jle     near .return            ; below is taken only when rowctr == 0

         ; Only the low 32 bits are consumed by movd, so a 32-bit move suffices.
         mov     edx, 0x00020001         ; bias pattern
         movd    xmm7, edx
         pcmpeqw xmm6, xmm6
         pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
         psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

         mov     rsi, r14                ; input_data
         mov     rdi, r15                ; output_data
 .rowloop:
         push    rcx                     ; output_cols, restored for the next row
         push    rdi
         push    rsi

         mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
         mov     rdi, JSAMPROW [rdi]                     ; outptr

         cmp     rcx, byte SIZEOF_XMMWORD
         jae     short .columnloop

 .columnloop_r8:
         ; Fewer than SIZEOF_XMMWORD output columns remain: load one vector
         ; from each input row, zero the second pair, and force rcx so the
         ; loop ends after the store below (which still writes a full XMMWORD).
         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
         pxor    xmm2, xmm2
         pxor    xmm3, xmm3
         mov     rcx, SIZEOF_XMMWORD
         jmp     short .downsample

 .columnloop:
         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
         movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
         movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

 .downsample:
         ; Horizontal pair sums for the first 16 bytes of each row.
         movdqa  xmm4, xmm0
         movdqa  xmm5, xmm1
         pand    xmm0, xmm6              ; even-numbered samples as words
         psrlw   xmm4, BYTE_BIT          ; odd-numbered samples as words
         pand    xmm1, xmm6
         psrlw   xmm5, BYTE_BIT
         paddw   xmm0, xmm4
         paddw   xmm1, xmm5

         ; Same for the second 16 bytes of each row.
         movdqa  xmm4, xmm2
         movdqa  xmm5, xmm3
         pand    xmm2, xmm6
         psrlw   xmm4, BYTE_BIT
         pand    xmm3, xmm6
         psrlw   xmm5, BYTE_BIT
         paddw   xmm2, xmm4
         paddw   xmm3, xmm5

         paddw   xmm0, xmm1              ; row0 + row1 -> 2x2 sums
         paddw   xmm2, xmm3
         paddw   xmm0, xmm7              ; + alternating 1/2 bias
         paddw   xmm2, xmm7
         psrlw   xmm0, 2                 ; / 4
         psrlw   xmm2, 2

         packuswb xmm0, xmm2             ; 16 words -> 16 output bytes

         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
         add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
         cmp     rcx, byte SIZEOF_XMMWORD
         jae     near .columnloop
         test    rcx, rcx
         jnz     near .columnloop_r8

         pop     rsi
         pop     rdi
         pop     rcx

         add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
         add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
         dec     rax                             ; rowctr
         jg      near .rowloop

 .return:
         uncollect_args
         pop     rbp
         ret

 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
         align   16
	;
	; jcsample.asm - downsampling (64-bit SSE2)
	;
	; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
	; Copyright (C) 2009, D. R. Commander.
	;
	; Based on the x86 SIMD extension for IJG JPEG library
	; Copyright (C) 1999-2006, MIYASAKA Masaru.
	; For conditions of distribution and use, see copyright notice in jsimdext.inc
	;
	; This file should be assembled with NASM (Netwide Assembler); it
	; can not be assembled with Microsoft's MASM or any compatible
	; assembler (including Borland's Turbo Assembler).
	; NASM is available from http://nasm.sourceforge.net/ or
	; http://sourceforge.net/project/showfiles.php?group_id=6208
	;
	; [TAB8]

	%include "jsimdext.inc"

	; --------------------------------------------------------------------------
	SECTION SEG_TEXT
	BITS 64
	;
	; Downsample pixel values of a single component.
	; This version handles the common case of 2:1 horizontal and 1:1 vertical,
	; without smoothing.
	;
	; GLOBAL(void)
	; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
	; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
	; JSAMPARRAY input_data, JSAMPARRAY output_data);
	;

	; r10 = JDIMENSION image_width
	; r11 = int max_v_samp_factor
	; r12 = JDIMENSION v_samp_factor
	; r13 = JDIMENSION width_blocks
	; r14 = JSAMPARRAY input_data
	; r15 = JSAMPARRAY output_data

	align 16
	global EXTN(jsimd_h2v1_downsample_sse2)

	EXTN(jsimd_h2v1_downsample_sse2):
		; 2:1 horizontal, 1:1 vertical downsampling of one component.
		; Phase 1 pads every input row on the right up to output_cols*2
		; bytes, phase 2 averages each horizontal pair of samples into one
		; output sample, 16 output samples per iteration.
		;
		; rax is set to rsp before collect_args; presumably that macro
		; (defined in jsimdext.inc, not visible here) uses it to locate
		; stack arguments.
		push rbp
		mov rax, rsp
		mov rbp, rsp
		collect_args

		mov ecx, r13d ; width_blocks, zero-extended to 64 bits
		shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
		jz near .return ; no output columns -> nothing to do

		mov edx, r10d ; rdx = image_width, zero-extended

		; -- expand_right_edge

		push rcx ; keep output_cols for the second phase
		shl rcx, 1 ; output_cols * 2
		sub rcx, rdx ; rcx = bytes of padding needed per row
		jle short .expand_end ; rows are already wide enough

		; max_v_samp_factor is a 32-bit int.  Bits 63:32 of the register it
		; arrives in are not defined by the calling convention, so
		; sign-extend the low half rather than reading all 64 bits of r11.
		movsxd rax, r11d ; rax = number of rows to pad
		test rax, rax
		jle short .expand_end

		cld ; rep stosb must walk upwards
		mov rsi, r14 ; input_data
	.expandloop:
		push rax ; al is about to be overwritten
		push rcx ; rep stosb consumes rcx

		mov rdi, JSAMPROW [rsi]
		add rdi, rdx ; rdi -> first byte past the real samples
		mov al, JSAMPLE [rdi-1] ; al = last real sample of this row

		rep stosb ; replicate it rcx times

		pop rcx
		pop rax

		add rsi, byte SIZEOF_JSAMPROW
		dec rax
		jg short .expandloop

	.expand_end:
		pop rcx ; output_cols

		; -- h2v1_downsample

		mov eax, r12d ; rowctr
		test eax, eax
		jle near .return

		; Only the low 32 bits are consumed by movd, so a 32-bit move
		; suffices.
		mov edx, 0x00010000 ; bias pattern
		movd xmm7, edx
		pcmpeqw xmm6, xmm6
		pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
		psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}

		mov rsi, r14 ; input_data
		mov rdi, r15 ; output_data
	.rowloop:
		push rcx ; output_cols, restored for the next row
		push rdi
		push rsi

		mov rsi, JSAMPROW [rsi] ; inptr
		mov rdi, JSAMPROW [rdi] ; outptr

		cmp rcx, byte SIZEOF_XMMWORD
		jae short .columnloop

	.columnloop_r8:
		; Fewer than SIZEOF_XMMWORD output columns remain: load one input
		; vector, zero the second, and force rcx so the loop ends after
		; the store below (which still writes a full XMMWORD).
		movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
		pxor xmm1, xmm1
		mov rcx, SIZEOF_XMMWORD
		jmp short .downsample

	.columnloop:
		movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
		movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

	.downsample:
		movdqa xmm2, xmm0
		movdqa xmm3, xmm1

		pand xmm0, xmm6 ; even-numbered samples as words
		psrlw xmm2, BYTE_BIT ; odd-numbered samples as words
		pand xmm1, xmm6
		psrlw xmm3, BYTE_BIT

		paddw xmm0, xmm2 ; pair sums
		paddw xmm1, xmm3
		paddw xmm0, xmm7 ; + alternating 0/1 bias
		paddw xmm1, xmm7
		psrlw xmm0, 1 ; / 2
		psrlw xmm1, 1

		packuswb xmm0, xmm1 ; 16 words -> 16 output bytes

		movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

		sub rcx, byte SIZEOF_XMMWORD ; outcol
		add rsi, byte 2*SIZEOF_XMMWORD ; inptr
		add rdi, byte 1*SIZEOF_XMMWORD ; outptr
		cmp rcx, byte SIZEOF_XMMWORD
		jae short .columnloop
		test rcx, rcx
		jnz short .columnloop_r8

		pop rsi
		pop rdi
		pop rcx

		add rsi, byte SIZEOF_JSAMPROW ; input_data
		add rdi, byte SIZEOF_JSAMPROW ; output_data
		dec rax ; rowctr
		jg near .rowloop

	.return:
		uncollect_args
		pop rbp
		ret

	; --------------------------------------------------------------------------
	;
	; Downsample pixel values of a single component.
	; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
	; without smoothing.
	;
	; GLOBAL(void)
	; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
	; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
	; JSAMPARRAY input_data, JSAMPARRAY output_data);
	;

	; r10 = JDIMENSION image_width
	; r11 = int max_v_samp_factor
	; r12 = JDIMENSION v_samp_factor
	; r13 = JDIMENSION width_blocks
	; r14 = JSAMPARRAY input_data
	; r15 = JSAMPARRAY output_data

	align 16
	global EXTN(jsimd_h2v2_downsample_sse2)

	EXTN(jsimd_h2v2_downsample_sse2):
		; 2:1 horizontal, 2:1 vertical downsampling of one component.
		; Phase 1 pads every input row on the right up to output_cols*2
		; bytes, phase 2 averages each 2x2 group of samples (two adjacent
		; input rows) into one output sample, 16 output samples per
		; iteration.
		;
		; rax is set to rsp before collect_args; presumably that macro
		; (defined in jsimdext.inc, not visible here) uses it to locate
		; stack arguments.
		push rbp
		mov rax, rsp
		mov rbp, rsp
		collect_args

		mov ecx, r13d ; width_blocks, zero-extended to 64 bits
		shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
		jz near .return ; no output columns -> nothing to do

		mov edx, r10d ; rdx = image_width, zero-extended

		; -- expand_right_edge

		push rcx ; keep output_cols for the second phase
		shl rcx, 1 ; output_cols * 2
		sub rcx, rdx ; rcx = bytes of padding needed per row
		jle short .expand_end ; rows are already wide enough

		; max_v_samp_factor is a 32-bit int.  Bits 63:32 of the register it
		; arrives in are not defined by the calling convention, so
		; sign-extend the low half rather than reading all 64 bits of r11.
		movsxd rax, r11d ; rax = number of rows to pad
		test rax, rax
		jle short .expand_end

		cld ; rep stosb must walk upwards
		mov rsi, r14 ; input_data
	.expandloop:
		push rax ; al is about to be overwritten
		push rcx ; rep stosb consumes rcx

		mov rdi, JSAMPROW [rsi]
		add rdi, rdx ; rdi -> first byte past the real samples
		mov al, JSAMPLE [rdi-1] ; al = last real sample of this row

		rep stosb ; replicate it rcx times

		pop rcx
		pop rax

		add rsi, byte SIZEOF_JSAMPROW
		dec rax
		jg short .expandloop

	.expand_end:
		pop rcx ; output_cols

		; -- h2v2_downsample

		mov eax, r12d ; rowctr (upper half of rax cleared)
		test rax, rax ; rax is zero-extended, so the jle below
		jle near .return ; is taken only when rowctr == 0

		; Only the low 32 bits are consumed by movd, so a 32-bit move
		; suffices.
		mov edx, 0x00020001 ; bias pattern
		movd xmm7, edx
		pcmpeqw xmm6, xmm6
		pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
		psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}

		mov rsi, r14 ; input_data
		mov rdi, r15 ; output_data
	.rowloop:
		push rcx ; output_cols, restored for the next row
		push rdi
		push rsi

		mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
		mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
		mov rdi, JSAMPROW [rdi] ; outptr

		cmp rcx, byte SIZEOF_XMMWORD
		jae short .columnloop

	.columnloop_r8:
		; Fewer than SIZEOF_XMMWORD output columns remain: load one vector
		; from each input row, zero the second pair, and force rcx so the
		; loop ends after the store below (which still writes a full
		; XMMWORD).
		movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
		movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
		pxor xmm2, xmm2
		pxor xmm3, xmm3
		mov rcx, SIZEOF_XMMWORD
		jmp short .downsample

	.columnloop:
		movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
		movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
		movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
		movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

	.downsample:
		; Horizontal pair sums for the first 16 bytes of each row.
		movdqa xmm4, xmm0
		movdqa xmm5, xmm1
		pand xmm0, xmm6 ; even-numbered samples as words
		psrlw xmm4, BYTE_BIT ; odd-numbered samples as words
		pand xmm1, xmm6
		psrlw xmm5, BYTE_BIT
		paddw xmm0, xmm4
		paddw xmm1, xmm5

		; Same for the second 16 bytes of each row.
		movdqa xmm4, xmm2
		movdqa xmm5, xmm3
		pand xmm2, xmm6
		psrlw xmm4, BYTE_BIT
		pand xmm3, xmm6
		psrlw xmm5, BYTE_BIT
		paddw xmm2, xmm4
		paddw xmm3, xmm5

		paddw xmm0, xmm1 ; row0 + row1 -> 2x2 sums
		paddw xmm2, xmm3
		paddw xmm0, xmm7 ; + alternating 1/2 bias
		paddw xmm2, xmm7
		psrlw xmm0, 2 ; / 4
		psrlw xmm2, 2

		packuswb xmm0, xmm2 ; 16 words -> 16 output bytes

		movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

		sub rcx, byte SIZEOF_XMMWORD ; outcol
		add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
		add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
		add rdi, byte 1*SIZEOF_XMMWORD ; outptr
		cmp rcx, byte SIZEOF_XMMWORD
		jae near .columnloop
		test rcx, rcx
		jnz near .columnloop_r8

		pop rsi
		pop rdi
		pop rcx

		add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
		add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
		dec rax ; rowctr
		jg near .rowloop

	.return:
		uncollect_args
		pop rbp
		ret

	; For some reason, the OS X linker does not honor the request to align the
	; segment unless we do this.
	align 16