| /* |
| * MIPS DSPr2 optimizations for libjpeg-turbo |
| * |
| * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. |
| * All Rights Reserved. |
| * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> |
| * Darko Laus <darko.laus@imgtec.com> |
| * Copyright (C) 2015, D. R. Commander. All Rights Reserved. |
| * |
| * This software is provided 'as-is', without any express or implied |
| * warranty. In no event will the authors be held liable for any damages |
| * arising from the use of this software. |
| * |
| * Permission is granted to anyone to use this software for any purpose, |
| * including commercial applications, and to alter it and redistribute it |
| * freely, subject to the following restrictions: |
| * |
| * 1. The origin of this software must not be misrepresented; you must not |
| * claim that you wrote the original software. If you use this software |
| * in a product, an acknowledgment in the product documentation would be |
| * appreciated but is not required. |
| * 2. Altered source versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * 3. This notice may not be removed or altered from any source distribution. |
| */ |
| |
| #include "jsimd_dspr2_asm.h" |
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * Null color conversion for compression: de-interleaves the
 * num_components-channel input rows into separate component planes,
 * copying samples unchanged.  Two paths: labels 0-3 handle a width
 * that is not a multiple of 4 (residual samples first, then 4 at a
 * time); labels 4-6 handle a width that is a multiple of 4.
 * NOTE: the instruction after every branch is a delay slot and
 * executes unconditionally.
 *
 * a0 = cinfo->image_width
 * a1 = input_buf
 * a2 = output_buf
 * a3 = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 *
 * Null conversion for compression
 */
    SAVE_REGS_ON_STACK 8, s0, s1

    /* Incoming stack args are shifted by the 8-byte save area. */
    lw    t9, 24(sp)                 /* t9 = num_rows */
    lw    s0, 28(sp)                 /* s0 = cinfo->num_components */
    andi  t0, a0, 3                  /* t0 = cinfo->image_width & 3 */
    beqz  t0, 4f                     /* no residual */
    nop
/* ---- path with residual samples (width % 4 != 0) ---- */
0:                                   /* per-row loop */
    addiu t9, t9, -1                 /* num_rows-- */
    bltz  t9, 7f                     /* done when count goes negative */
    li    t1, 0                      /* (delay slot) t1 = ci, component index */
1:                                   /* per-component loop */
    sll   t3, t1, 2
    lwx   t5, t3(a2)                 /* t5 = outptr = output_buf[ci] */
    lw    t2, 0(a1)                  /* t2 = inptr = *input_buf */
    sll   t4, a3, 2
    lwx   t5, t4(t5)                 /* t5 = outptr = output_buf[ci][output_row] */
    addu  t2, t2, t1                 /* inptr += ci: first sample of component */
    addu  s1, t5, a0                 /* s1 = end of output row */
    addu  t6, t5, t0                 /* t6 = end of residual section */
2:                                   /* residual loop: 1 sample/iteration */
    lbu   t3, 0(t2)
    addiu t5, t5, 1
    sb    t3, -1(t5)
    bne   t6, t5, 2b
    addu  t2, t2, s0                 /* (delay slot) inptr += num_components */
3:                                   /* unrolled loop: 4 samples/iteration */
    lbu   t3, 0(t2)
    addu  t4, t2, s0
    addu  t7, t4, s0
    addu  t8, t7, s0
    addu  t2, t8, s0                 /* advance inptr by 4 pixels */
    lbu   t4, 0(t4)
    lbu   t7, 0(t7)
    lbu   t8, 0(t8)
    addiu t5, t5, 4
    sb    t3, -4(t5)
    sb    t4, -3(t5)
    sb    t7, -2(t5)
    bne   s1, t5, 3b
    sb    t8, -1(t5)                 /* (delay slot) store 4th sample */
    addiu t1, t1, 1                  /* ci++ */
    bne   t1, s0, 1b
    nop
    addiu a1, a1, 4                  /* input_buf++ */
    bgez  t9, 0b
    addiu a3, a3, 1                  /* (delay slot) output_row++ */
    b     7f
    nop
/* ---- path without residual samples (width % 4 == 0) ---- */
4:                                   /* per-row loop */
    addiu t9, t9, -1
    bltz  t9, 7f
    li    t1, 0                      /* (delay slot) t1 = ci */
5:                                   /* per-component loop */
    sll   t3, t1, 2
    lwx   t5, t3(a2)                 /* t5 = outptr = output_buf[ci] */
    lw    t2, 0(a1)                  /* t2 = inptr = *input_buf */
    sll   t4, a3, 2
    lwx   t5, t4(t5)                 /* t5 = outptr = output_buf[ci][output_row] */
    addu  t2, t2, t1
    addu  s1, t5, a0                 /* s1 = end of output row */
    addu  t6, t5, t0
6:                                   /* unrolled loop: 4 samples/iteration */
    lbu   t3, 0(t2)
    addu  t4, t2, s0
    addu  t7, t4, s0
    addu  t8, t7, s0
    addu  t2, t8, s0                 /* advance inptr by 4 pixels */
    lbu   t4, 0(t4)
    lbu   t7, 0(t7)
    lbu   t8, 0(t8)
    addiu t5, t5, 4
    sb    t3, -4(t5)
    sb    t4, -3(t5)
    sb    t7, -2(t5)
    bne   s1, t5, 6b
    sb    t8, -1(t5)                 /* (delay slot) store 4th sample */
    addiu t1, t1, 1                  /* ci++ */
    bne   t1, s0, 5b
    nop
    addiu a1, a1, 4                  /* input_buf++ */
    bgez  t9, 4b
    addiu a3, a3, 1                  /* (delay slot) output_row++ */
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j     ra
    nop

END(jsimd_c_null_convert_dspr2)
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_extrgb_ycc_convert_dspr2 |
| * jsimd_extbgr_ycc_convert_dspr2 |
| * jsimd_extrgbx_ycc_convert_dspr2 |
| * jsimd_extbgrx_ycc_convert_dspr2 |
| * jsimd_extxbgr_ycc_convert_dspr2 |
| * jsimd_extxrgb_ycc_convert_dspr2 |
| * |
| * Colorspace conversion RGB -> YCbCr |
| */ |
| |
/*
 * Generates one RGB -> YCbCr row-conversion routine for a given input
 * pixel format: pixel_size bytes per pixel, with the R/G/B samples at
 * byte offsets r_offs/g_offs/b_offs.
 */
.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
                                            r_offs, g_offs, b_offs

/* Load one pixel's R/G/B samples and advance the input pointer. */
.macro DO_RGB_TO_YCC r, g, b, inptr
    lbu   \r, \r_offs(\inptr)
    lbu   \g, \g_offs(\inptr)
    lbu   \b, \b_offs(\inptr)
    addiu \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * a0 = cinfo->image_width
 * a1 = input_buf
 * a2 = output_buf
 * a3 = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    /* Fixed-point (16-bit fraction) color-conversion coefficients. */
    lw    t7, 48(sp)      /* t7 = num_rows (16(sp) + 32-byte save area) */
    li    s0, 0x4c8b      /* FIX(0.29900) */
    li    s1, 0x9646      /* FIX(0.58700) */
    li    s2, 0x1d2f      /* FIX(0.11400) */
    li    s3, 0xffffd4cd  /* -FIX(0.16874) */
    li    s4, 0xffffab33  /* -FIX(0.33126) */
    li    s5, 0x8000      /* FIX(0.50000) */
    li    s6, 0xffff94d1  /* -FIX(0.41869) */
    li    s7, 0xffffeb2f  /* -FIX(0.08131) */
    li    t8, 0x807fff    /* CBCR_OFFSET + ONE_HALF-1 */

0:                        /* per-row loop */
    addiu t7, -1          /* --num_rows */
    lw    t6, 0(a1)       /* t6 = input_buf[0] */
    lw    t0, 0(a2)
    lw    t1, 4(a2)
    lw    t2, 8(a2)
    sll   t3, a3, 2
    lwx   t0, t3(t0)      /* t0 = output_buf[0][output_row] (Y) */
    lwx   t1, t3(t1)      /* t1 = output_buf[1][output_row] (Cb) */
    lwx   t2, t3(t2)      /* t2 = output_buf[2][output_row] (Cr) */

    addu  t9, t2, a0      /* t9 = end address */
    addiu a3, 1           /* output_row++ */

1:                        /* per-pixel loop */
    DO_RGB_TO_YCC t3, t4, t5, t6

    /* Three MAC accumulators run in parallel:
     *   ac0: Y  =  0.29900 R + 0.58700 G + 0.11400 B  (+ rounding)
     *   ac1: Cb = -0.16874 R - 0.33126 G + 0.50000 B  (+ offset)
     *   ac2: Cr =  0.50000 R - 0.41869 G - 0.08131 B  (+ offset)
     */
    mtlo  s5, $ac0
    mtlo  t8, $ac1
    mtlo  t8, $ac2
    maddu $ac0, s2, t5    /* ac0 += 0.11400 * B */
    maddu $ac1, s5, t5    /* ac1 += 0.50000 * B */
    maddu $ac2, s5, t3    /* ac2 += 0.50000 * R */
    maddu $ac0, s0, t3    /* ac0 += 0.29900 * R */
    maddu $ac1, s3, t3    /* ac1 -= 0.16874 * R */
    maddu $ac2, s6, t4    /* ac2 -= 0.41869 * G */
    maddu $ac0, s1, t4    /* ac0 += 0.58700 * G */
    maddu $ac1, s4, t4    /* ac1 -= 0.33126 * G */
    maddu $ac2, s7, t5    /* ac2 -= 0.08131 * B */
    extr.w t3, $ac0, 16   /* t3 = Y  (descale by SCALEBITS=16) */
    extr.w t4, $ac1, 16   /* t4 = Cb */
    extr.w t5, $ac2, 16   /* t5 = Cr */
    sb    t3, 0(t0)
    sb    t4, 0(t1)
    sb    t5, 0(t2)
    addiu t0, 1
    addiu t2, 1
    bne   t2, t9, 1b
    addiu t1, 1           /* (delay slot) advance Cb pointer */
    bgtz  t7, 0b
    addiu a1, 4           /* (delay slot) input_buf++ */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm
| |
/* Instantiate one RGB -> YCbCr routine per extended RGB pixel format. */
/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_ycc_extrgb_convert_dspr2 |
| * jsimd_ycc_extbgr_convert_dspr2 |
| * jsimd_ycc_extrgbx_convert_dspr2 |
| * jsimd_ycc_extbgrx_convert_dspr2 |
| * jsimd_ycc_extxbgr_convert_dspr2 |
| * jsimd_ycc_extxrgb_convert_dspr2 |
| * |
| * Colorspace conversion YCbCr -> RGB |
| */ |
| |
/*
 * Generates one YCbCr -> RGB row-conversion routine for a given output
 * pixel format.  a_offs is the alpha byte position; it is only written
 * (as 0xFF) when pixel_size == 4.
 */
.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
                                            r_offs, g_offs, b_offs, a_offs

/* Store one clipped R/G/B triple (plus opaque alpha for 4-byte
   formats) and advance the output pointer. */
.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r_offs(\outptr)
    sb    \scratch1, \g_offs(\outptr)
    sb    \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li    t0, 0xFF
    sb    t0, \a_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * a0 = cinfo->image_width
 * a1 = input_buf
 * a2 = input_row
 * a3 = output_buf
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw    s1, 48(sp)        /* s1 = num_rows (16(sp) + 32-byte save area) */
    li    t3, 0x8000        /* ONE_HALF */
    li    t4, 0x166e9       /* FIX(1.40200) */
    li    t5, 0x1c5a2       /* FIX(1.77200) */
    li    t6, 0xffff492e    /* -FIX(0.71414) */
    li    t7, 0xffffa7e6    /* -FIX(0.34414) */
    repl.ph t8, 128         /* t8 = 128|128 packed, for clipping below */

0:                          /* per-row loop */
    lw    s0, 0(a3)         /* s0 = outptr = *output_buf */
    lw    t0, 0(a1)         /* input_buf[0] = Y planes */
    lw    t1, 4(a1)         /* input_buf[1] = Cb planes */
    lw    t2, 8(a1)         /* input_buf[2] = Cr planes */
    sll   s5, a2, 2
    addiu s1, -1            /* --num_rows */
    lwx   s2, s5(t0)        /* s2 = input_buf[0][input_row] */
    lwx   s3, s5(t1)        /* s3 = input_buf[1][input_row] */
    lwx   s4, s5(t2)        /* s4 = input_buf[2][input_row] */
    addu  t9, s2, a0        /* t9 = end of Y row */
    addiu a2, 1             /* input_row++ */

1:                          /* per-pixel loop */
    lbu   s7, 0(s4)         /* cr */
    lbu   s6, 0(s3)         /* cb */
    lbu   s5, 0(s2)         /* y */
    addiu s2, 1
    addiu s4, 1
    addiu s7, -128          /* cr - 128 */
    addiu s6, -128          /* cb - 128 */
    mul   t2, t7, s6        /* -FIX(0.34414) * (cb - 128) */
    mul   t0, t6, s7        /* Crgtab[cr] */
    sll   s7, 15
    mulq_rs.w t1, t4, s7    /* Crrtab[cr] */
    sll   s6, 15
    addu  t2, t3            /* Cbgtab[cb] (add ONE_HALF for rounding) */
    addu  t2, t0            /* green chroma term, still scaled */

    mulq_rs.w t0, t5, s6    /* Cbbtab[cb] */
    sra   t2, 16            /* descale green chroma */
    addu  t1, s5            /* red   = y + cred */
    addu  t2, s5            /* add y: green = y + cgreen */
    /* Clip red/green together as packed halfwords: bias by -128,
     * saturating shift left then arithmetic shift right by 8 clamps
     * each halfword to [-128,127], then the +128 restores [0,255]. */
    ins   t2, t1, 16, 16    /* pack red|green */
    subu.ph t2, t2, t8
    addu  t0, s5            /* blue = y + cblue */
    shll_s.ph t2, t2, 8
    subu  t0, 128           /* same clip for blue, but word-wide */
    shra.ph t2, t2, 8
    shll_s.w t0, t0, 24
    addu.ph t2, t2, t8      /* clip & store */
    sra   t0, t0, 24
    sra   t1, t2, 16        /* t1 = clipped red */
    addiu t0, 128           /* t0 = clipped blue */

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne   s2, t9, 1b
    addiu s3, 1             /* (delay slot) advance Cb pointer */
    bgtz  s1, 0b
    addiu a3, 4             /* (delay slot) output_buf++ */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm
| |
/* Instantiate one YCbCr -> RGB routine per extended RGB pixel format. */
/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_extrgb_gray_convert_dspr2 |
| * jsimd_extbgr_gray_convert_dspr2 |
| * jsimd_extrgbx_gray_convert_dspr2 |
| * jsimd_extbgrx_gray_convert_dspr2 |
| * jsimd_extxbgr_gray_convert_dspr2 |
| * jsimd_extxrgb_gray_convert_dspr2 |
| * |
| * Colorspace conversion RGB -> GRAY |
| */ |
| |
/*
 * Generates one RGB -> grayscale row-conversion routine for a given
 * input pixel format.  Y = 0.29900 R + 0.58700 G + 0.11400 B, rounded.
 */
.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

/* Load one pixel's R/G/B samples and advance the input pointer. */
.macro DO_RGB_TO_GRAY r, g, b, inptr
    lbu   \r, \r_offs(\inptr)
    lbu   \g, \g_offs(\inptr)
    lbu   \b, \b_offs(\inptr)
    addiu \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * a0 = cinfo->image_width
 * a1 = input_buf
 * a2 = output_buf
 * a3 = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li    s0, 0x4c8b      /* s0 = FIX(0.29900) */
    li    s1, 0x9646      /* s1 = FIX(0.58700) */
    li    s2, 0x1d2f      /* s2 = FIX(0.11400) */
    li    s7, 0x8000      /* s7 = FIX(0.50000) (rounding term) */
    lw    s6, 48(sp)      /* s6 = num_rows (16(sp) + 32-byte save area) */
    andi  t7, a0, 3       /* t7 = residual pixels (image_width & 3) */

0:                        /* per-row loop */
    addiu s6, -1          /* s6 = num_rows */
    lw    t0, 0(a1)       /* t0 = inptr = *input_buf */
    lw    t1, 0(a2)
    sll   t3, a3, 2
    lwx   t1, t3(t1)      /* t1 = output_buf[0][output_row] */
    addiu a3, 1           /* output_row++ */
    addu  t9, t1, a0      /* t9 = end of output row */
    subu  t8, t9, t7      /* t8 = end of unrolled (x4) section */
    beq   t1, t8, 2f      /* no full 4-pixel group */
    nop

1:                        /* unrolled loop: 4 pixels/iteration */
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    /* Two MAC accumulators interleaved to hide multiply latency. */
    mtlo  s7, $ac0        /* preload rounding term */
    maddu $ac0, s2, t5
    maddu $ac0, s1, t4
    maddu $ac0, s0, t3
    mtlo  s7, $ac1
    maddu $ac1, s2, s5
    maddu $ac1, s1, s4
    maddu $ac1, s0, s3
    extr.w t6, $ac0, 16   /* pixel 0 */

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo  s7, $ac0
    maddu $ac0, s2, t5
    maddu $ac0, s1, t4
    extr.w t2, $ac1, 16   /* pixel 1 */
    maddu $ac0, s0, t3
    mtlo  s7, $ac1
    maddu $ac1, s2, s5
    maddu $ac1, s1, s4
    maddu $ac1, s0, s3
    extr.w t5, $ac0, 16   /* pixel 2 */
    sb    t6, 0(t1)
    sb    t2, 1(t1)
    extr.w t3, $ac1, 16   /* pixel 3 */
    addiu t1, 4
    sb    t5, -2(t1)
    sb    t3, -1(t1)
    bne   t1, t8, 1b
    nop

2:
    beqz  t7, 4f          /* no residual pixels */
    nop

3:                        /* residual loop: 1 pixel/iteration */
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo  s7, $ac0
    maddu $ac0, s2, t5
    maddu $ac0, s1, t4
    maddu $ac0, s0, t3
    extr.w t6, $ac0, 16
    sb    t6, 0(t1)
    addiu t1, 1
    bne   t1, t9, 3b
    nop

4:
    bgtz  s6, 0b
    addiu a1, 4           /* (delay slot) input_buf++ */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm
| |
/* Instantiate one RGB -> grayscale routine per extended RGB format. */
/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_h2v2_merged_upsample_dspr2 |
| * jsimd_h2v2_extrgb_merged_upsample_dspr2 |
| * jsimd_h2v2_extrgbx_merged_upsample_dspr2 |
| * jsimd_h2v2_extbgr_merged_upsample_dspr2 |
| * jsimd_h2v2_extbgrx_merged_upsample_dspr2 |
| * jsimd_h2v2_extxbgr_merged_upsample_dspr2 |
| * jsimd_h2v2_extxrgb_merged_upsample_dspr2 |
| * |
| * Merged h2v2 upsample routines |
| */ |
/*
 * Generates one h2v2 merged-upsample routine: each loop iteration
 * consumes one Cb/Cr pair and emits 2 pixels into each of two output
 * rows (4 pixels total) directly in the target RGB pixel format.
 * pixel_size is the byte size of a 2-pixel group; the *2_offs
 * arguments place the second pixel, and the alpha offsets are written
 * (as 0xFF) only when pixel_size == 8.
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two horizontally adjacent pixels (plus opaque alpha for
   8-byte groups) and advance the output pointer. */
.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    \scratch0, 0xFF
    sb    \scratch0, \a1_offs(\outptr)
    sb    \scratch0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store a single trailing pixel (odd output_width). */
.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0 = cinfo->output_width
 * a1 = input_buf
 * a2 = in_row_group_ctr
 * a3 = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw    t9, 56(sp)        /* cinfo->sample_range_limit */
    lw    v0, 0(a1)
    lw    v1, 4(a1)
    lw    t0, 8(a1)
    sll   t1, a2, 3
    addiu t2, t1, 4
    sll   t3, a2, 2
    lw    t4, 0(a3)         /* t4 = output_buf[0] */
    lwx   t1, t1(v0)        /* t1 = input_buf[0][in_row_group_ctr*2] */
    lwx   t2, t2(v0)        /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
    lwx   t5, t3(v1)        /* t5 = input_buf[1][in_row_group_ctr] */
    lwx   t6, t3(t0)        /* t6 = input_buf[2][in_row_group_ctr] */
    lw    t7, 4(a3)         /* t7 = output_buf[1] */
    /* Build fixed-point constants from each other (shorter than
     * full 32-bit li sequences); addiu immediates sign-extend. */
    li    s1, 0xe6ea
    addiu t8, s1, 0x7fff    /* t8 = 0x166e9 [FIX(1.40200)] */
    addiu s0, t8, 0x5eb9    /* s0 = 0x1c5a2 [FIX(1.77200)] */
    addiu s1, zero, 0xa7e6  /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
    xori  s2, s1, 0xeec8    /* s2 = 0xffff492e [-FIX(0.71414)] */
    srl   t3, a0, 1         /* t3 = number of column pairs */
    blez  t3, 2f
    addu  t0, t5, t3        /* (delay slot) t0 = end address */
1:                          /* loop: 2 columns x 2 rows per iteration */
    lbu   t3, 0(t5)         /* cb */
    lbu   s3, 0(t6)         /* cr */
    addiu t5, t5, 1
    addiu t3, t3, -128      /* (cb - 128) */
    addiu s3, s3, -128      /* (cr - 128) */
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3      /* ac1 = -0.34414*cb - 0.71414*cr */
    sll   s3, s3, 15
    sll   t3, t3, 15
    mulq_rs.w s4, t8, s3    /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    extr_r.w s5, $ac1, 16   /* s5 = cgreen */
    mulq_rs.w s6, s0, t3    /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
    lbu   v0, 0(t1)         /* y (upper row, even column) */
    addiu t6, t6, 1
    addiu t1, t1, 2
    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   AT, 0(t3)         /* range-limit lookups; AT/ra as scratch */
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, -1(t1)        /* y (upper row, odd column) */
    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)         /* y (lower row, even column) */

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   AT, 0(t3)
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, 1(t2)         /* y (lower row, odd column) */
    addiu t2, t2, 2
    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne   t0, t5, 1b
    nop
2:                          /* trailing column when output_width is odd */
    andi  t0, a0, 1
    beqz  t0, 4f
    lbu   t3, 0(t5)         /* (delay slot) cb */
    lbu   s3, 0(t6)         /* cr */
    addiu t3, t3, -128      /* (cb - 128) */
    addiu s3, s3, -128      /* (cr - 128) */
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3
    sll   s3, s3, 15
    sll   t3, t3, 15
    lbu   v0, 0(t1)         /* y (upper row) */
    extr_r.w s5, $ac1, 16
    mulq_rs.w s4, t8, s3    /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w s6, s0, t3    /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)         /* y (lower row) */

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu  t3, v0, s4        /* y+cred */
    addu  s3, v0, s5        /* y+cgreen */
    addu  v1, v0, s6        /* y+cblue */
    addu  t3, t9, t3        /* y+cred */
    addu  s3, t9, s3        /* y+cgreen */
    addu  v1, t9, v1        /* y+cblue */
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j     ra
    nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm
| |
/* Instantiate one h2v2 merged-upsample routine per RGB pixel format. */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_h2v1_merged_upsample_dspr2 |
| * jsimd_h2v1_extrgb_merged_upsample_dspr2 |
| * jsimd_h2v1_extrgbx_merged_upsample_dspr2 |
| * jsimd_h2v1_extbgr_merged_upsample_dspr2 |
| * jsimd_h2v1_extbgrx_merged_upsample_dspr2 |
| * jsimd_h2v1_extxbgr_merged_upsample_dspr2 |
| * jsimd_h2v1_extxrgb_merged_upsample_dspr2 |
| * |
| * Merged h2v1 upsample routines |
| */ |
| |
/*
 * Generates one h2v1 merged-upsample routine: each loop iteration
 * consumes one Cb/Cr pair and emits 2 horizontally adjacent pixels in
 * the target RGB format.  Alpha offsets are written (as 0xFF) only
 * when pixel_size == 8.
 */
.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two horizontally adjacent pixels (plus opaque alpha for
   8-byte groups) and advance the output pointer. */
.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
    sb    t0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store a single trailing pixel (odd output_width). */
.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0 = cinfo->output_width
 * a1 = input_buf
 * a2 = in_row_group_ctr
 * a3 = output_buf
 * 16(sp) = range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li    t0, 0xe6ea
    lw    t1, 0(a1)         /* t1 = input_buf[0] */
    lw    t2, 4(a1)         /* t2 = input_buf[1] */
    lw    t3, 8(a1)         /* t3 = input_buf[2] */
    lw    t8, 56(sp)        /* t8 = range_limit */
    /* Constants built from each other; addiu immediates sign-extend. */
    addiu s1, t0, 0x7fff    /* s1 = 0x166e9 [FIX(1.40200)] */
    addiu s2, s1, 0x5eb9    /* s2 = 0x1c5a2 [FIX(1.77200)] */
    addiu s0, t0, 0x9916    /* s0 = 0x8000 (ONE_HALF) */
    addiu s4, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
    xori  s3, s4, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
    srl   t0, a0, 1         /* t0 = number of column pairs */
    sll   t4, a2, 2
    lwx   s5, t4(t1)        /* s5 = inptr0 (Y) */
    lwx   s6, t4(t2)        /* s6 = inptr1 (Cb) */
    lwx   s7, t4(t3)        /* s7 = inptr2 (Cr) */
    lw    t7, 0(a3)         /* t7 = outptr */
    blez  t0, 2f
    addu  t9, s6, t0        /* (delay slot) t9 = end address */
1:                          /* loop: 1 chroma pair -> 2 pixels */
    lbu   t2, 0(s6)         /* t2 = cb */
    lbu   t0, 0(s7)         /* t0 = cr */
    lbu   t1, 0(s5)         /* t1 = y */
    addiu t2, t2, -128      /* t2 = cb - 128 */
    addiu t0, t0, -128      /* t0 = cr - 128 */
    mult  $ac1, s4, t2
    madd  $ac1, s3, t0      /* ac1 = -0.34414*cb - 0.71414*cr */
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0    /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
    extr_r.w t5, $ac1, 16   /* t5 = cgreen */
    mulq_rs.w t6, s2, t2    /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
    addiu s7, s7, 1
    addiu s6, s6, 1
    addu  t2, t1, t0        /* t2 = y + cred */
    addu  t3, t1, t5        /* t3 = y + cgreen */
    addu  t4, t1, t6        /* t4 = y + cblue */
    addu  t2, t8, t2        /* range-limit table lookups */
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t1, 1(s5)         /* y of the odd column (same chroma) */
    lbu   v0, 0(t2)
    lbu   v1, 0(t3)
    lbu   ra, 0(t4)
    addu  t2, t1, t0
    addu  t3, t1, t5
    addu  t4, t1, t6
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne   t9, s6, 1b
    addiu s5, s5, 2         /* (delay slot) inptr0 += 2 */
2:                          /* trailing column when output_width is odd */
    andi  t0, a0, 1
    beqz  t0, 4f
    nop
3:
    lbu   t2, 0(s6)
    lbu   t0, 0(s7)
    lbu   t1, 0(s5)
    addiu t2, t2, -128      /* (cb - 128) */
    addiu t0, t0, -128      /* (cr - 128) */
    mul   t3, s4, t2
    mul   t4, s3, t0
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0    /* (C1*cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w t6, s2, t2    /* (C2*cb + ONE_HALF)>> SCALEBITS */
    addu  t3, t3, s0        /* + ONE_HALF */
    addu  t3, t4, t3
    sra   t5, t3, 16        /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
    addu  t2, t1, t0        /* y + cred */
    addu  t3, t1, t5        /* y + cgreen */
    addu  t4, t1, t6        /* y + cblue */
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j     ra
    nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm
| |
/* Instantiate one h2v1 merged-upsample routine per RGB pixel format. */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
| |
| |
| /*****************************************************************************/ |
| /* |
| * jsimd_h2v2_fancy_upsample_dspr2 |
| * |
| * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. |
| */ |
LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) h2v2 upsampling: each input row produces two
 * output rows (blended with the row above, then the row below), and
 * each input column produces two output columns weighted 3:1 with its
 * horizontal neighbors.  Edge columns replicate the nearest column sum.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li    s4, 0             /* s4 = output row index * 4 */
    lw    s2, 0(a3)         /* s2 = *output_data_ptr */
0:                          /* per-input-row loop */
    li    t9, 2             /* 2 output rows per input row */
    lw    s1, -4(a2)        /* s1 = inptr1 = row above (1st pass) */

1:                          /* per-output-row loop */
    lw    s0, 0(a2)         /* s0 = inptr0 = current row */
    lwx   s3, s4(s2)        /* s3 = outptr */
    addiu s5, a1, -2        /* s5 = downsampled_width - 2 */
    srl   t4, s5, 1
    sll   t4, t4, 1         /* t4 = (downsampled_width - 2) & ~1 */
    lbu   t0, 0(s0)
    lbu   t1, 1(s0)
    lbu   t2, 0(s1)
    lbu   t3, 1(s1)
    addiu s0, 2
    addiu s1, 2
    addu  t8, s0, t4        /* t8 = end address of main loop */
    andi  s5, s5, 1         /* s5 = residual */
    sll   t4, t0, 1
    sll   t6, t1, 1
    addu  t0, t0, t4        /* t0 = (*inptr0++) * 3 */
    addu  t1, t1, t6        /* t1 = (*inptr0++) * 3 */
    addu  t7, t0, t2        /* t7 = thiscolsum (3*this + other row) */
    addu  t6, t1, t3        /* t6 = nextcolsum */
    /* Left edge: first output pair replicates thiscolsum. */
    sll   t0, t7, 2         /* t0 = thiscolsum * 4 */
    subu  t1, t0, t7        /* t1 = thiscolsum * 3 */
    shra_r.w t0, t0, 4      /* (thiscolsum * 4 + 8) >> 4 */
    addiu t1, 7
    addu  t1, t1, t6
    srl   t1, t1, 4         /* (thiscolsum*3 + nextcolsum + 7) >> 4 */
    sb    t0, 0(s3)
    sb    t1, 1(s3)
    beq   t8, s0, 22f       /* skip to final iteration if width == 3 */
    addiu s3, 2             /* (delay slot) */
2:                          /* main loop: 2 input cols -> 4 output cols */
    lh    t0, 0(s0)         /* t0 = A3|A2 */
    lh    t2, 0(s1)         /* t2 = B3|B2 */
    addiu s0, 2
    addiu s1, 2
    preceu.ph.qbr t0, t0    /* t0 = 0|A3|0|A2 */
    preceu.ph.qbr t2, t2    /* t2 = 0|B3|0|B2 */
    shll.ph t1, t0, 1
    sll   t3, t6, 1
    addu.ph t0, t1, t0      /* t0 = A3*3|A2*3 */
    addu  t3, t3, t6        /* t3 = this * 3 */
    addu.ph t0, t0, t2      /* t0 = next2|next1 (packed column sums) */
    addu  t1, t3, t7
    andi  t7, t0, 0xFFFF    /* t7 = next1 */
    sll   t2, t7, 1
    addu  t2, t7, t2        /* t2 = next1*3 */
    addu  t4, t2, t6
    srl   t6, t0, 16        /* t6 = next2 */
    shra_r.w t1, t1, 4      /* t1 = (this*3 + last + 8) >> 4 */
    addu  t0, t3, t7
    addiu t0, 7
    srl   t0, t0, 4         /* t0 = (this*3 + next1 + 7) >> 4 */
    shra_r.w t4, t4, 4      /* t4 = (next1*3 + this + 8) >> 4 */
    addu  t2, t2, t6
    addiu t2, 7
    srl   t2, t2, 4         /* t2 = (next1*3 + next2 + 7) >> 4 */
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    sb    t4, 2(s3)
    sb    t2, 3(s3)
    bne   t8, s0, 2b
    addiu s3, 4             /* (delay slot) */
22:
    beqz  s5, 4f            /* no residual column */
    addu  t8, s0, s5        /* (delay slot) residual end address */
3:                          /* residual: 1 input col -> 2 output cols */
    lbu   t0, 0(s0)
    lbu   t2, 0(s1)
    addiu s0, 1
    addiu s1, 1
    sll   t3, t6, 1
    sll   t1, t0, 1
    addu  t1, t0, t1        /* t1 = inptr0 * 3 */
    addu  t3, t3, t6        /* t3 = thiscolsum * 3 */
    addu  t5, t1, t2        /* t5 = nextcolsum */
    addu  t1, t3, t7
    shra_r.w t1, t1, 4
    addu  t0, t3, t5
    addiu t0, 7
    srl   t0, t0, 4
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    addiu s3, 2
    move  t7, t6            /* shift column sums along */
    bne   t8, s0, 3b
    move  t6, t5            /* (delay slot) */
4:                          /* right edge: replicate last column sum */
    sll   t0, t6, 2         /* t0 = thiscolsum * 4 */
    subu  t1, t0, t6        /* t1 = thiscolsum * 3 */
    addu  t1, t1, t7
    addiu s4, 4             /* next output row */
    shra_r.w t1, t1, 4
    addiu t0, 7
    srl   t0, t0, 4
    sb    t1, 0(s3)
    sb    t0, 1(s3)
    addiu t9, -1
    addiu s3, 2
    bnez  t9, 1b
    lw    s1, 4(a2)         /* (delay slot) inptr1 = row below (2nd pass) */
    srl   t0, s4, 2
    subu  t0, a0, t0
    bgtz  t0, 0b            /* until max_v_samp_factor rows emitted */
    addiu a2, 4             /* (delay slot) input_data++ */

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j     ra
    nop
END(jsimd_h2v2_fancy_upsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) h2v1 upsampling: each input sample produces
 * two output samples weighted 3:1 with its left/right neighbors; the
 * first and last output samples replicate the edge samples.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at                 /* let the assembler use $at for macros */

    beqz  a0, 3f
    sll   t0, a0, 2         /* (delay slot) t0 = row count * 4 */
    lw    s1, 0(a3)         /* s1 = output row-pointer cursor */
    li    s3, 0x10001       /* packed |1|1| rounding constant */
    addu  s0, s1, t0        /* s0 = end of row-pointer array */
0:                          /* per-row loop */
    addiu t8, a1, -2
    srl   t9, t8, 2         /* t9 = number of 4-sample groups */
    lw    t7, 0(a2)         /* t7 = inptr */
    lw    s2, 0(s1)         /* s2 = outptr */
    /* Left edge: replicate first sample, then blend with inptr[1]. */
    lbu   t0, 0(t7)
    lbu   t1, 1(t7)         /* t1 = inptr[1] */
    sll   t2, t0, 1
    addu  t2, t2, t0        /* t2 = invalue*3 */
    addu  t2, t2, t1
    shra_r.w t2, t2, 2
    sb    t0, 0(s2)
    sb    t2, 1(s2)
    beqz  t9, 11f
    addiu s2, 2             /* (delay slot) */
1:                          /* main loop: 4 input -> 8 output samples */
    ulw   t0, 0(t7)         /* t0 = |P3|P2|P1|P0| */
    ulw   t1, 1(t7)
    ulh   t2, 4(t7)         /* t2 = |0|0|P5|P4| */
    preceu.ph.qbl t3, t0    /* t3 = |0|P3|0|P2| */
    preceu.ph.qbr t0, t0    /* t0 = |0|P1|0|P0| */
    preceu.ph.qbr t2, t2    /* t2 = |0|P5|0|P4| */
    preceu.ph.qbl t4, t1    /* t4 = |0|P4|0|P3| */
    preceu.ph.qbr t1, t1    /* t1 = |0|P2|0|P1| */
    shll.ph t5, t4, 1
    shll.ph t6, t1, 1
    addu.ph t5, t5, t4      /* t5 = |P4*3|P3*3| */
    addu.ph t6, t6, t1      /* t6 = |P2*3|P1*3| */
    addu.ph t4, t3, s3      /* + 1 (round) */
    addu.ph t0, t0, s3
    addu.ph t4, t4, t5
    addu.ph t0, t0, t6
    shrl.ph t4, t4, 2       /* t4 = |0|P3|0|P2| */
    shrl.ph t0, t0, 2       /* t0 = |0|P1|0|P0| */
    addu.ph t2, t2, t5
    addu.ph t3, t3, t6
    shra_r.ph t2, t2, 2     /* t2 = |0|P5|0|P4| */
    shra_r.ph t3, t3, 2     /* t3 = |0|P3|0|P2| */
    shll.ph t2, t2, 8
    shll.ph t3, t3, 8
    or    t2, t4, t2        /* interleave halves into output bytes */
    or    t3, t3, t0
    addiu t9, -1
    usw   t3, 0(s2)
    usw   t2, 4(s2)
    addiu s2, 8
    bgtz  t9, 1b
    addiu t7, 4             /* (delay slot) */
11:
    andi  t8, 3             /* t8 = residual input samples */
    beqz  t8, 22f
    addiu t7, 1             /* (delay slot) */

2:                          /* residual loop: 1 input -> 2 output */
    lbu   t0, 0(t7)
    addiu t7, 1
    sll   t1, t0, 1
    addu  t2, t0, t1        /* t2 = invalue * 3 */
    lbu   t3, -2(t7)        /* left neighbor */
    lbu   t4, 0(t7)         /* right neighbor */
    addiu t3, 1             /* + 1 (round) */
    addiu t4, 2             /* + 2 (round) */
    addu  t3, t3, t2
    addu  t4, t4, t2
    srl   t3, 2
    srl   t4, 2
    sb    t3, 0(s2)
    sb    t4, 1(s2)
    addiu t8, -1
    bgtz  t8, 2b
    addiu s2, 2             /* (delay slot) */

22:                         /* right edge: blend, then replicate */
    lbu   t0, 0(t7)
    lbu   t2, -1(t7)
    sll   t1, t0, 1
    addu  t1, t1, t0        /* t1 = invalue * 3 */
    addu  t1, t1, t2
    addiu t1, 1
    srl   t1, t1, 2
    sb    t1, 0(s2)
    sb    t0, 1(s2)
    addiu s1, 4             /* next output row pointer */
    bne   s1, s0, 0b
    addiu a2, 4             /* (delay slot) input_data++ */
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j     ra
    nop
END(jsimd_h2v1_fancy_upsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * h2v1 downsampling: averages horizontal sample pairs with an
 * alternating 0/1 rounding bias, then expands the row to
 * width_in_blocks*DCTSIZE output columns by replicating the last
 * pixel value.
 *
 * a0 = cinfo->image_width
 * a1 = cinfo->max_v_samp_factor
 * a2 = compptr->v_samp_factor
 * a3 = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz  a2, 7f            /* nothing to do if v_samp_factor == 0 */
    lw    s1, 44(sp)        /* (delay slot) s1 = output_data */
    lw    s0, 40(sp)        /* s0 = input_data */
    /* s2 = output pairs to pad at the row end:
     * total pairs (width_in_blocks*DCTSIZE/2) minus computed pairs. */
    srl   s2, a0, 2
    andi  t9, a0, 2
    srl   t7, t9, 1
    addu  s2, t7, s2
    sll   t0, a3, 3         /* t0 = width_in_blocks*DCT */
    srl   t7, t0, 1
    subu  s2, t7, s2
    /* NOTE(review): s2 is consumed to zero by loop 4 below and is not
     * reinitialized per row; rows after the first (v_samp_factor > 1)
     * get no padding — presumably v_samp_factor == 1 here; confirm. */
0:                          /* per-row loop */
    andi  t6, a0, 1         /* t6 = temp_index */
    addiu t6, -1            /* offset back to last valid input sample */
    lw    t4, 0(s1)         /* t4 = outptr */
    lw    t5, 0(s0)         /* t5 = inptr0 */
    li    s3, 0             /* s3 = bias */
    srl   t7, a0, 1         /* t7 = image_width1 */
    srl   s4, t7, 2         /* s4 = groups of 4 output samples */
    andi  t8, t7, 3         /* t8 = residual output samples */
1:                          /* main loop: 4 output samples */
    ulhu  t0, 0(t5)
    ulhu  t1, 2(t5)
    ulhu  t2, 4(t5)
    ulhu  t3, 6(t5)
    raddu.w.qb t0, t0       /* sum the two bytes of each pair */
    raddu.w.qb t1, t1
    raddu.w.qb t2, t2
    raddu.w.qb t3, t3
    shra.ph t0, t0, 1       /* bias 0: truncate */
    shra_r.ph t1, t1, 1     /* bias 1: round */
    shra.ph t2, t2, 1
    shra_r.ph t3, t3, 1
    sb    t0, 0(t4)
    sb    t1, 1(t4)
    sb    t2, 2(t4)
    sb    t3, 3(t4)
    addiu s4, -1
    addiu t4, 4
    bgtz  s4, 1b
    addiu t5, 8             /* (delay slot) */
    beqz  t8, 3f
    addu  s4, t8            /* (delay slot) s4 = residual end address */
2:                          /* residual loop: 1 output sample */
    ulhu  t0, 0(t5)
    raddu.w.qb t0, t0
    addqh.w t0, t0, s3      /* (sum + bias) >> 1 */
    xori  s3, s3, 1         /* alternate bias 0/1 */
    sb    t0, 0(t4)
    addiu t4, 1
    bne   t4, s4, 2b
    addiu t5, 2             /* (delay slot) */
3:                          /* right-edge expansion */
    lbux  t1, t6(t5)        /* last valid input sample */
    sll   t1, 1
    addqh.w t2, t1, s3      /* t2 = pixval1 */
    xori  s3, s3, 1
    addqh.w t3, t1, s3      /* t3 = pixval2 */
    blez  s2, 5f
    append t3, t2, 8        /* (delay slot) t3 = pixval2|pixval1 */
    addu  t5, t4, s2        /* t5 = loop_end2 */
4:                          /* replicate edge pair into padding */
    ush   t3, 0(t4)
    addiu s2, -1
    bgtz  s2, 4b
    addiu t4, 2             /* (delay slot) */
5:
    beqz  t9, 6f            /* one extra sample if image_width & 2 */
    nop
    sb    t2, 0(t4)
6:
    addiu s1, 4             /* output_data++ */
    addiu a2, -1
    bnez  a2, 0b
    addiu s0, 4             /* (delay slot) input_data++ */
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j     ra
    nop
END(jsimd_h2v1_downsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * h2v2 downsampling: averages each 2x2 input block (two rows at a
 * time) with an alternating 1/2 rounding bias, then expands the row
 * to width_in_blocks*DCTSIZE output columns by replicating the last
 * pixel value.
 *
 * a0 = cinfo->image_width
 * a1 = cinfo->max_v_samp_factor
 * a2 = compptr->v_samp_factor
 * a3 = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz  a2, 8f            /* nothing to do if v_samp_factor == 0 */
    lw    s1, 52(sp)        /* (delay slot) s1 = output_data */
    lw    s0, 48(sp)        /* s0 = input_data */

    andi  t6, a0, 1         /* t6 = temp_index */
    addiu t6, -1            /* offset back to last valid input sample */
    srl   t7, a0, 1         /* t7 = image_width1 */
    srl   s4, t7, 2         /* s4 = groups of 4 output samples */
    andi  t8, t7, 3         /* t8 = residual output samples */
    andi  t9, a0, 2
    /* s2 = output pairs to pad at the row end (total minus computed). */
    srl   s2, a0, 2
    srl   t7, t9, 1
    addu  s2, t7, s2
    sll   t0, a3, 3         /* t0 = width_in_blocks*DCT */
    srl   t7, t0, 1
    subu  s2, t7, s2
    /* NOTE(review): s2, s4, and t8 are initialized once here and
     * consumed/overwritten by the first row's loops; rows after the
     * first (v_samp_factor > 1) would see stale values — presumably
     * v_samp_factor == 1 here; confirm against callers. */
0:                          /* per-output-row loop */
    lw    t4, 0(s1)         /* t4 = outptr */
    lw    t5, 0(s0)         /* t5 = inptr0 */
    lw    s7, 4(s0)         /* s7 = inptr1 (next input row) */
    li    s6, 1             /* s6 = bias */
2:                          /* main loop: 4 output samples */
    ulw   t0, 0(t5)         /* t0 = |P3|P2|P1|P0| */
    ulw   t1, 0(s7)         /* t1 = |Q3|Q2|Q1|Q0| */
    ulw   t2, 4(t5)
    ulw   t3, 4(s7)
    precrq.ph.w t7, t0, t1  /* t7 = |P3|P2|Q3|Q2| */
    ins   t0, t1, 16, 16    /* t0 = |Q1|Q0|P1|P0| */
    raddu.w.qb t1, t7       /* sum of one 2x2 block */
    raddu.w.qb t0, t0
    shra_r.w t1, t1, 2      /* bias 2: (sum + 2) >> 2 */
    addiu t0, 1             /* bias 1: (sum + 1) >> 2 */
    srl   t0, 2
    precrq.ph.w t7, t2, t3
    ins   t2, t3, 16, 16
    raddu.w.qb t7, t7
    raddu.w.qb t2, t2
    shra_r.w t7, t7, 2
    addiu t2, 1
    srl   t2, 2
    sb    t0, 0(t4)
    sb    t1, 1(t4)
    sb    t2, 2(t4)
    sb    t7, 3(t4)
    addiu t4, 4
    addiu t5, 8
    addiu s4, s4, -1
    bgtz  s4, 2b
    addiu s7, 8             /* (delay slot) */
    beqz  t8, 4f
    addu  t8, t4, t8        /* (delay slot) t8 = residual end address */
3:                          /* residual loop: 1 output sample */
    ulhu  t0, 0(t5)
    ulhu  t1, 0(s7)
    ins   t0, t1, 16, 16    /* pack the 2x2 block into one word */
    raddu.w.qb t0, t0
    addu  t0, t0, s6        /* + bias */
    srl   t0, 2
    xori  s6, s6, 3         /* alternate bias 1/2 */
    sb    t0, 0(t4)
    addiu t5, 2
    addiu t4, 1
    bne   t8, t4, 3b
    addiu s7, 2             /* (delay slot) */
4:                          /* right-edge expansion */
    lbux  t1, t6(t5)        /* last sample of row 0, doubled */
    sll   t1, 1
    lbux  t0, t6(s7)        /* last sample of row 1, doubled */
    sll   t0, 1
    addu  t1, t1, t0
    addu  t3, t1, s6
    srl   t0, t3, 2         /* t0 = pixval1 */
    xori  s6, s6, 3
    addu  t2, t1, s6
    srl   t1, t2, 2         /* t1 = pixval2 */
    blez  s2, 6f
    append t1, t0, 8        /* (delay slot) t1 = pixval2|pixval1 */
5:                          /* replicate edge pair into padding */
    ush   t1, 0(t4)
    addiu s2, -1
    bgtz  s2, 5b
    addiu t4, 2             /* (delay slot) */
6:
    beqz  t9, 7f            /* one extra sample if image_width & 2 */
    nop
    sb    t0, 0(t4)
7:
    addiu s1, 4             /* output_data++ */
    addiu a2, -1
    bnez  a2, 0b
    addiu s0, 8             /* (delay slot) input_data += 2 rows */
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_h2v2_downsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * 2x2 downsampling with inter-pixel smoothing: each output pixel is a
 * weighted blend of its 2x2 input block and the ring of 12 neighbouring
 * input pixels.  Weights are derived from cinfo->smoothing_factor (SF):
 * block sum is scaled by (16384 - SF*80) and the doubled-block-plus-ring
 * sum by SF*16, accumulated in $ac1 and rounded with a 16-bit extract.
 *
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor      (number of output rows)
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw        s7, 52(sp)              /* compptr->width_in_blocks */
    lw        s0, 56(sp)              /* cinfo->image_width */
    lw        s6, 48(sp)              /* cinfo->smoothing_factor */
    sll       s7, 3                   /* output_cols = width_in_blocks * DCTSIZE */
    sll       v0, s7, 1
    subu      v0, v0, s0              /* v0 = input columns to expand at right edge */
    blez      v0, 2f
    move      v1, zero                /* (delay slot) v1 = row index */
    addiu     t0, a3, 2               /* t0 = cinfo->max_v_samp_factor + 2 */
0:
    /* Edge expansion: replicate the last valid pixel of each input row
       (rows -1 .. max_v_samp_factor) so the smoothing reads stay in-bounds */
    addiu     t1, a0, -4
    sll       t2, v1, 2
    lwx       t1, t2(t1)              /* t1 = row pointer */
    move      t3, v0
    addu      t1, t1, s0              /* t1 = one past last valid pixel */
    lbu       t2, -1(t1)              /* t2 = last valid pixel value */
1:
    addiu     t3, t3, -1
    sb        t2, 0(t1)
    bgtz      t3, 1b
    addiu     t1, t1, 1               /* (delay slot) */
    addiu     v1, v1, 1
    bne       v1, t0, 0b
    nop
2:
    /* Precompute the smoothing weights */
    li        v0, 80
    mul       v0, s6, v0
    li        v1, 16384
    move      t4, zero                /* t4 = outrow */
    move      t5, zero                /* t5 = inrow index */
    subu      t6, v1, v0              /* t6 = 16384 - tmp_smoot_f * 80 */
    sll       t7, s6, 4               /* t7 = tmp_smoot_f * 16 */
3:
    /* Special case for first column: pretend column -1 is same as column 0 */
    sll       v0, t4, 2
    lwx       t8, v0(a1)              /* outptr = output_data[outrow] */
    sll       v1, t5, 2
    addiu     t9, v1, 4
    addiu     s0, v1, -4
    addiu     s1, v1, 8
    lwx       s2, v1(a0)              /* inptr0 = input_data[inrow] */
    lwx       t9, t9(a0)              /* inptr1 = input_data[inrow+1] */
    lwx       s0, s0(a0)              /* above_ptr = input_data[inrow-1] */
    lwx       s1, s1(a0)              /* below_ptr = input_data[inrow+2] */
    lh        v0, 0(s2)
    lh        v1, 0(t9)
    lh        t0, 0(s0)
    lh        t1, 0(s1)
    ins       v0, v1, 16, 16
    ins       t0, t1, 16, 16
    raddu.w.qb t2, v0                 /* t2 = sum of the 2x2 block */
    raddu.w.qb s3, t0                 /* s3 = sum of above/below pairs */
    lbu       v0, 0(s2)
    lbu       v1, 2(s2)
    lbu       t0, 0(t9)
    lbu       t1, 2(t9)
    addu      v0, v0, v1
    mult      $ac1, t2, t6            /* ac1 = block sum * (16384 - SF*80) */
    addu      t0, t0, t1
    lbu       t2, 2(s0)
    addu      t0, t0, v0
    lbu       t3, 2(s1)
    addu      s3, t0, s3
    lbu       v0, 0(s0)
    lbu       t0, 0(s1)
    sll       s3, s3, 1
    addu      v0, v0, t2
    addu      t0, t0, t3
    addu      t0, t0, v0
    addu      s3, t0, s3              /* s3 = neighbourhood sum */
    madd      $ac1, s3, t7            /* ac1 += neighbourhood sum * SF*16 */
    extr_r.w  v0, $ac1, 16            /* rounded >> 16 */
    addiu     t8, t8, 1
    addiu     s2, s2, 2
    addiu     t9, t9, 2
    addiu     s0, s0, 2
    addiu     s1, s1, 2
    sb        v0, -1(t8)
    addiu     s4, s7, -2
    and       s4, s4, 3               /* s4 = columns before the 4x-unrolled loop */
    addu      s5, s4, t8              /* end address */
4:
    /* One output column per iteration (pre-alignment for the 4x loop) */
    lh        v0, 0(s2)
    lh        v1, 0(t9)
    lh        t0, 0(s0)
    lh        t1, 0(s1)
    ins       v0, v1, 16, 16
    ins       t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu       v0, -1(s2)
    lbu       v1, 2(s2)
    lbu       t0, -1(t9)
    lbu       t1, 2(t9)
    addu      v0, v0, v1
    mult      $ac1, t2, t6
    addu      t0, t0, t1
    lbu       t2, 2(s0)
    addu      t0, t0, v0
    lbu       t3, 2(s1)
    addu      s3, t0, s3
    lbu       v0, -1(s0)
    lbu       t0, -1(s1)
    sll       s3, s3, 1
    addu      v0, v0, t2
    addu      t0, t0, t3
    addu      t0, t0, v0
    addu      s3, t0, s3
    madd      $ac1, s3, t7
    extr_r.w  t2, $ac1, 16
    addiu     t8, t8, 1
    addiu     s2, s2, 2
    addiu     t9, t9, 2
    addiu     s0, s0, 2
    sb        t2, -1(t8)
    bne       s5, t8, 4b
    addiu     s1, s1, 2               /* (delay slot) */
    addiu     s5, s7, -2
    subu      s5, s5, s4
    addu      s5, s5, t8              /* end address */
5:
    /* Main loop: 4 output columns per iteration, loads interleaved with
       the $ac1 multiply-accumulate pipeline */
    lh        v0, 0(s2)
    lh        v1, 0(t9)
    lh        t0, 0(s0)
    lh        t1, 0(s1)
    ins       v0, v1, 16, 16
    ins       t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu       v0, -1(s2)
    lbu       v1, 2(s2)
    lbu       t0, -1(t9)
    lbu       t1, 2(t9)
    addu      v0, v0, v1
    mult      $ac1, t2, t6
    addu      t0, t0, t1
    lbu       t2, 2(s0)
    addu      t0, t0, v0
    lbu       t3, 2(s1)
    addu      s3, t0, s3
    lbu       v0, -1(s0)
    lbu       t0, -1(s1)
    sll       s3, s3, 1
    addu      v0, v0, t2
    addu      t0, t0, t3
    lh        v1, 2(t9)
    addu      t0, t0, v0
    lh        v0, 2(s2)
    addu      s3, t0, s3
    lh        t0, 2(s0)
    lh        t1, 2(s1)
    madd      $ac1, s3, t7
    extr_r.w  t2, $ac1, 16
    ins       t0, t1, 16, 16
    ins       v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu       v1, 4(s2)
    lbu       t0, 1(t9)
    lbu       t1, 4(t9)
    sb        t2, 0(t8)
    raddu.w.qb t3, v0
    lbu       v0, 1(s2)
    addu      t0, t0, t1
    mult      $ac1, t3, t6
    addu      v0, v0, v1
    lbu       t2, 4(s0)
    addu      t0, t0, v0
    lbu       v0, 1(s0)
    addu      s3, t0, s3
    lbu       t0, 1(s1)
    lbu       t3, 4(s1)
    addu      v0, v0, t2
    sll       s3, s3, 1
    addu      t0, t0, t3
    lh        v1, 4(t9)
    addu      t0, t0, v0
    lh        v0, 4(s2)
    addu      s3, t0, s3
    lh        t0, 4(s0)
    lh        t1, 4(s1)
    madd      $ac1, s3, t7
    extr_r.w  t2, $ac1, 16
    ins       t0, t1, 16, 16
    ins       v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu       v1, 6(s2)
    lbu       t0, 3(t9)
    lbu       t1, 6(t9)
    sb        t2, 1(t8)
    raddu.w.qb t3, v0
    lbu       v0, 3(s2)
    addu      t0, t0, t1
    mult      $ac1, t3, t6
    addu      v0, v0, v1
    lbu       t2, 6(s0)
    addu      t0, t0, v0
    lbu       v0, 3(s0)
    addu      s3, t0, s3
    lbu       t0, 3(s1)
    lbu       t3, 6(s1)
    addu      v0, v0, t2
    sll       s3, s3, 1
    addu      t0, t0, t3
    lh        v1, 6(t9)
    addu      t0, t0, v0
    lh        v0, 6(s2)
    addu      s3, t0, s3
    lh        t0, 6(s0)
    lh        t1, 6(s1)
    madd      $ac1, s3, t7
    extr_r.w  t3, $ac1, 16
    ins       t0, t1, 16, 16
    ins       v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu       v1, 8(s2)
    lbu       t0, 5(t9)
    lbu       t1, 8(t9)
    sb        t3, 2(t8)
    raddu.w.qb t2, v0
    lbu       v0, 5(s2)
    addu      t0, t0, t1
    mult      $ac1, t2, t6
    addu      v0, v0, v1
    lbu       t2, 8(s0)
    addu      t0, t0, v0
    lbu       v0, 5(s0)
    addu      s3, t0, s3
    lbu       t0, 5(s1)
    lbu       t3, 8(s1)
    addu      v0, v0, t2
    sll       s3, s3, 1
    addu      t0, t0, t3
    addiu     t8, t8, 4
    addu      t0, t0, v0
    addiu     s2, s2, 8
    addu      s3, t0, s3
    addiu     t9, t9, 8
    madd      $ac1, s3, t7
    extr_r.w  t1, $ac1, 16
    addiu     s0, s0, 8
    addiu     s1, s1, 8
    bne       s5, t8, 5b
    sb        t1, -1(t8)              /* (delay slot) */
    /* Special case for last column */
    lh        v0, 0(s2)
    lh        v1, 0(t9)
    lh        t0, 0(s0)
    lh        t1, 0(s1)
    ins       v0, v1, 16, 16
    ins       t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu       v0, -1(s2)
    lbu       v1, 1(s2)
    lbu       t0, -1(t9)
    lbu       t1, 1(t9)
    addu      v0, v0, v1
    mult      $ac1, t2, t6
    addu      t0, t0, t1
    lbu       t2, 1(s0)
    addu      t0, t0, v0
    lbu       t3, 1(s1)
    addu      s3, t0, s3
    lbu       v0, -1(s0)
    lbu       t0, -1(s1)
    sll       s3, s3, 1
    addu      v0, v0, t2
    addu      t0, t0, t3
    addu      t0, t0, v0
    addu      s3, t0, s3
    madd      $ac1, s3, t7
    extr_r.w  t0, $ac1, 16
    addiu     t5, t5, 2
    sb        t0, 0(t8)
    addiu     t4, t4, 1
    bne       t4, a2, 3b
    addiu     t5, t5, 2               /* (delay slot) NOTE(review): t5 is
                                         incremented by 2 twice per output row
                                         (here and 4 insns above); confirm
                                         against the C version's inrow += 2
                                         and the `sll v1, t5, 2` indexing */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j         ra
    nop

END(jsimd_h2v2_smooth_downsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * Generic integer upsampling: replicate each input pixel h_expand times
 * horizontally, then duplicate each produced row v_expand times.
 *
 * a0     = upsample->h_expand[compptr->component_index]
 * a1     = upsample->v_expand[compptr->component_index]
 * a2     = input_data
 * a3     = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw        s0, 0(a3)               /* s0 = output_data */
    lw        s1, 32(sp)              /* s1 = cinfo->output_width */
    lw        s2, 36(sp)              /* s2 = cinfo->max_v_samp_factor */
    li        t6, 0                   /* t6 = inrow */
    beqz      s2, 10f
    li        s3, 0                   /* (delay slot) s3 = outrow */
0:
    /* NOTE(review): t6 and s3 are added to the row-pointer array bases as
       raw offsets (t6 += 1, s3 += v_expand per iteration); confirm the
       offset units against the 4-byte JSAMPROW pointer size */
    addu      t0, a2, t6
    addu      t7, s0, s3
    lw        t3, 0(t0)               /* t3 = inptr */
    lw        t8, 0(t7)               /* t8 = outptr */
    beqz      s1, 4f
    addu      t5, t8, s1              /* (delay slot) t5 = outend */
1:
    /* Horizontal expansion: write each input pixel h_expand times */
    lb        t2, 0(t3)               /* t2 = invalue = *inptr++ */
    addiu     t3, 1
    beqz      a0, 3f
    move      t0, a0                  /* (delay slot) t0 = h_expand */
2:
    sb        t2, 0(t8)
    addiu     t0, -1
    bgtz      t0, 2b
    addiu     t8, 1                   /* (delay slot) */
3:
    bgt       t5, t8, 1b
    nop
4:
    /* Vertical expansion: duplicate the row v_expand - 1 times */
    addiu     t9, a1, -1              /* t9 = v_expand - 1 */
    blez      t9, 9f
    nop
5:
    lw        t3, 0(s0)               /* t3 = source row */
    lw        t4, 4(s0)               /* t4 = destination row */
    subu      t0, s1, 0xF
    blez      t0, 7f                  /* width < 16: byte copy only */
    addu      t5, t3, s1              /* (delay slot) t5 = end address */
    andi      t7, s1, 0xF             /* t7 = residual bytes */
    subu      t8, t5, t7
6:
    /* Copy 16 bytes per iteration */
    ulw       t0, 0(t3)
    ulw       t1, 4(t3)
    ulw       t2, 8(t3)
    usw       t0, 0(t4)
    ulw       t0, 12(t3)
    usw       t1, 4(t4)
    usw       t2, 8(t4)
    usw       t0, 12(t4)
    addiu     t3, 16
    bne       t3, t8, 6b
    addiu     t4, 16                  /* (delay slot) */
    beqz      t7, 8f
    nop
7:
    /* Copy the remaining bytes one at a time */
    lbu       t0, 0(t3)
    sb        t0, 0(t4)
    addiu     t3, 1
    bne       t3, t5, 7b
    addiu     t4, 1                   /* (delay slot) */
8:
    addiu     t9, -1
    bgtz      t9, 5b
    addiu     s0, 8                   /* (delay slot) advance row pointers */
9:
    addu      s3, s3, a1              /* outrow += v_expand */
    bne       s3, s2, 0b
    addiu     t6, 1                   /* (delay slot) next input row */
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j         ra
    nop
END(jsimd_int_upsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * 2:1 horizontal upsampling (no vertical expansion): each input pixel is
 * written twice to the output row.  The main loop doubles 8 input pixels
 * (16 output pixels) per iteration; the residual loop handles the rest.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    lw        t7, 0(a3)               /* t7 = output_data */
    andi      t8, a1, 0xf             /* t8 = residual */
    sll       t0, a0, 2
    blez      a0, 4f
    addu      t9, t7, t0              /* (delay slot) t9 = output_data end address */
0:
    lw        t5, 0(t7)               /* t5 = outptr */
    lw        t6, 0(a2)               /* t6 = inptr */
    addu      t3, t5, a1              /* t3 = outptr + output_width (end address) */
    subu      t3, t8                  /* t3 = end address - residual */
    beq       t5, t3, 2f
    move      t4, t8                  /* (delay slot) */
1:
    /* Double 8 input pixels -> 16 output pixels per iteration */
    ulw       t0, 0(t6)               /* t0 = |P3|P2|P1|P0| */
    ulw       t2, 4(t6)               /* t2 = |P7|P6|P5|P4| */
    srl       t1, t0, 16              /* t1 = |X|X|P3|P2| */
    ins       t0, t0, 16, 16          /* t0 = |P1|P0|P1|P0| */
    ins       t1, t1, 16, 16          /* t1 = |P3|P2|P3|P2| */
    ins       t0, t0, 8, 16           /* t0 = |P1|P1|P0|P0| */
    ins       t1, t1, 8, 16           /* t1 = |P3|P3|P2|P2| */
    usw       t0, 0(t5)
    usw       t1, 4(t5)
    srl       t0, t2, 16              /* t0 = |X|X|P7|P6| */
    ins       t2, t2, 16, 16          /* t2 = |P5|P4|P5|P4| */
    ins       t0, t0, 16, 16          /* t0 = |P7|P6|P7|P6| */
    ins       t2, t2, 8, 16           /* t2 = |P5|P5|P4|P4| */
    ins       t0, t0, 8, 16           /* t0 = |P7|P7|P6|P6| */
    usw       t2, 8(t5)
    usw       t0, 12(t5)
    addiu     t5, 16
    bne       t5, t3, 1b
    addiu     t6, 8                   /* (delay slot) */
    beqz      t8, 3f
    move      t4, t8                  /* (delay slot) */
2:
    /* Residual: double one pixel per iteration */
    lbu       t1, 0(t6)
    sb        t1, 0(t5)
    sb        t1, 1(t5)
    addiu     t4, -2
    addiu     t6, 1
    bgtz      t4, 2b
    addiu     t5, 2                   /* (delay slot) */
3:
    addiu     t7, 4                   /* next output row */
    bne       t9, t7, 0b
    addiu     a2, 4                   /* (delay slot) next input row */
4:
    j         ra
    nop
END(jsimd_h2v1_upsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * 2:1 horizontal + 2:1 vertical upsampling: each input pixel is written
 * twice horizontally, and each produced output row is then copied to the
 * next output row.  Two output rows are produced per input row, so the
 * outer loop consumes a0 (max_v_samp_factor) in steps of 2.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    lw        t7, 0(a3)               /* t7 = output_data */
    blez      a0, 7f
    andi      t9, a1, 0xf             /* (delay slot) t9 = residual */
0:
    lw        t6, 0(a2)               /* t6 = inptr */
    lw        t5, 0(t7)               /* t5 = outptr */
    addu      t8, t5, a1              /* t8 = outptr end address */
    subu      t8, t9                  /* t8 = end address - residual */
    beq       t5, t8, 2f
    move      t4, t9                  /* (delay slot) */
1:
    /* Horizontal doubling: 8 input pixels -> 16 output pixels */
    ulw       t0, 0(t6)
    srl       t1, t0, 16
    ins       t0, t0, 16, 16
    ins       t0, t0, 8, 16
    ins       t1, t1, 16, 16
    ins       t1, t1, 8, 16
    ulw       t2, 4(t6)
    usw       t0, 0(t5)
    usw       t1, 4(t5)
    srl       t3, t2, 16
    ins       t2, t2, 16, 16
    ins       t2, t2, 8, 16
    ins       t3, t3, 16, 16
    ins       t3, t3, 8, 16
    usw       t2, 8(t5)
    usw       t3, 12(t5)
    addiu     t5, 16
    bne       t5, t8, 1b
    addiu     t6, 8                   /* (delay slot) */
    beqz      t9, 3f
    move      t4, t9                  /* (delay slot) */
2:
    /* Residual: double one pixel per iteration */
    lbu       t0, 0(t6)
    sb        t0, 0(t5)
    sb        t0, 1(t5)
    addiu     t4, -2
    addiu     t6, 1
    bgtz      t4, 2b
    addiu     t5, 2                   /* (delay slot) */
3:
    /* Vertical doubling: copy outptr[0] to outptr[1] */
    lw        t6, 0(t7)               /* t6 = outptr[0] */
    lw        t5, 4(t7)               /* t5 = outptr[1] */
    addu      t4, t6, a1              /* t4 = new end address */
    beq       a1, t9, 5f              /* width < 16: byte copy only */
    subu      t8, t4, t9              /* (delay slot) */
4:
    /* Copy 16 bytes per iteration */
    ulw       t0, 0(t6)
    ulw       t1, 4(t6)
    ulw       t2, 8(t6)
    usw       t0, 0(t5)
    ulw       t0, 12(t6)
    usw       t1, 4(t5)
    usw       t2, 8(t5)
    usw       t0, 12(t5)
    addiu     t6, 16
    bne       t6, t8, 4b
    addiu     t5, 16                  /* (delay slot) */
    beqz      t9, 6f
    nop
5:
    /* Copy the remaining bytes one at a time */
    lbu       t0, 0(t6)
    sb        t0, 0(t5)
    addiu     t6, 1
    bne       t6, t4, 5b
    addiu     t5, 1                   /* (delay slot) */
6:
    addiu     t7, 8                   /* advance two output rows */
    addiu     a0, -2
    bgtz      a0, 0b
    addiu     a2, 4                   /* (delay slot) next input row */
7:
    j         ra
    nop
END(jsimd_h2v2_upsample_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_idct_islow_dspr2)
/*
 * Accurate (slow) integer 8x8 inverse DCT, Loeffler/Ligtenberg/Moshovitz
 * style, matching libjpeg's jpeg_idct_islow fixed-point constants
 * (CONST_BITS = 13).  Two passes over a 256-byte stack workspace:
 *   pass 1 (label 1) processes columns, dequantizing as it goes and
 *   storing 32-bit intermediates; pass 2 (label 4) processes rows,
 *   descales, clamps through range_limit, and writes bytes to the output.
 * Both passes short-circuit when all AC terms of a column/row are zero.
 *
 * a0 = coef_block
 * a1 = compptr->dcttable   (quantization table, int16 entries)
 * a2 = output
 * a3 = range_limit
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu     sp, sp, -256            /* workspace: 8x8 32-bit values */
    move      v0, sp                  /* v0 = wsptr */
    addiu     v1, zero, 8             /* v1 = DCTSIZE = 8 (column counter) */
1:
    /* Pass 1: one column per iteration.  Gather the 7 AC terms to test
       for the all-zero shortcut */
    lh        s4, 32(a0)              /* s4 = inptr[16] */
    lh        s5, 64(a0)              /* s5 = inptr[32] */
    lh        s6, 96(a0)              /* s6 = inptr[48] */
    lh        t1, 112(a0)             /* t1 = inptr[56] */
    lh        t7, 16(a0)              /* t7 = inptr[8] */
    lh        t5, 80(a0)              /* t5 = inptr[40] */
    lh        t3, 48(a0)              /* t3 = inptr[24] */
    or        s4, s4, t1
    or        s4, s4, t3
    or        s4, s4, t5
    or        s4, s4, t7
    or        s4, s4, s5
    or        s4, s4, s6
    bnez      s4, 2f
    addiu     v1, v1, -1              /* (delay slot) */
    /* AC terms all zero: replicate the scaled DC term down the column */
    lh        s5, 0(a1)               /* quantptr[DCTSIZE*0] */
    lh        s6, 0(a0)               /* inptr[DCTSIZE*0] */
    mul       s5, s5, s6              /* DEQUANTIZE(inptr[0], quantptr[0]) */
    sll       s5, s5, 2
    sw        s5, 0(v0)
    sw        s5, 32(v0)
    sw        s5, 64(v0)
    sw        s5, 96(v0)
    sw        s5, 128(v0)
    sw        s5, 160(v0)
    sw        s5, 192(v0)
    b         3f
    sw        s5, 224(v0)             /* (delay slot) */
2:
    /* Odd part */
    lh        t0, 112(a1)
    lh        t2, 48(a1)
    lh        t4, 80(a1)
    lh        t6, 16(a1)
    mul       t0, t0, t1              /* DEQUANTIZE(inptr[DCTSIZE*7],
                                         quantptr[DCTSIZE*7]) */
    mul       t1, t2, t3              /* DEQUANTIZE(inptr[DCTSIZE*3],
                                         quantptr[DCTSIZE*3]) */
    mul       t2, t4, t5              /* DEQUANTIZE(inptr[DCTSIZE*5],
                                         quantptr[DCTSIZE*5]) */
    mul       t3, t6, t7              /* DEQUANTIZE(inptr[DCTSIZE*1],
                                         quantptr[DCTSIZE*1]) */
    lh        t4, 32(a1)
    lh        t5, 32(a0)
    lh        t6, 96(a1)
    lh        t7, 96(a0)
    addu      s0, t0, t1              /* z3 = tmp0 + tmp2 */
    addu      s1, t1, t2              /* z2 = tmp1 + tmp2 */
    addu      s2, t2, t3              /* z4 = tmp1 + tmp3 */
    addu      s3, s0, s2              /* z3 + z4 */
    addiu     t9, zero, 9633          /* FIX_1_175875602 */
    mul       s3, s3, t9              /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    addu      t8, t0, t3              /* z1 = tmp0 + tmp3 */
    addiu     t9, zero, 2446          /* FIX_0_298631336 */
    mul       t0, t0, t9              /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    addiu     t9, zero, 16819         /* FIX_2_053119869 */
    mul       t2, t2, t9              /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    addiu     t9, zero, 25172         /* FIX_3_072711026 */
    mul       t1, t1, t9              /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    addiu     t9, zero, 12299         /* FIX_1_501321110 */
    mul       t3, t3, t9              /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    addiu     t9, zero, 16069         /* FIX_1_961570560 */
    mul       s0, s0, t9              /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
    addiu     t9, zero, 3196          /* FIX_0_390180644 */
    mul       s2, s2, t9              /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
    addiu     t9, zero, 7373          /* FIX_0_899976223 */
    mul       t8, t8, t9              /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
    addiu     t9, zero, 20995         /* FIX_2_562915447 */
    mul       s1, s1, t9              /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
    subu      s0, s3, s0              /* z3 += z5 */
    addu      t0, t0, s0              /* tmp0 += z3 */
    addu      t1, t1, s0              /* tmp2 += z3 */
    subu      s2, s3, s2              /* z4 += z5 */
    addu      t2, t2, s2              /* tmp1 += z4 */
    addu      t3, t3, s2              /* tmp3 += z4 */
    subu      t0, t0, t8              /* tmp0 += z1 */
    subu      t1, t1, s1              /* tmp2 += z2 */
    subu      t2, t2, s1              /* tmp1 += z2 */
    subu      t3, t3, t8              /* tmp3 += z1 */
    /* Even part */
    mul       s0, t4, t5              /* DEQUANTIZE(inptr[DCTSIZE*2],
                                         quantptr[DCTSIZE*2]) */
    addiu     t9, zero, 6270          /* FIX_0_765366865 */
    mul       s1, t6, t7              /* DEQUANTIZE(inptr[DCTSIZE*6],
                                         quantptr[DCTSIZE*6]) */
    lh        t4, 0(a1)
    lh        t5, 0(a0)
    lh        t6, 64(a1)
    lh        t7, 64(a0)
    mul       s2, t9, s0              /* MULTIPLY(z2, FIX_0_765366865) */
    mul       t5, t4, t5              /* DEQUANTIZE(inptr[DCTSIZE*0],
                                         quantptr[DCTSIZE*0]) */
    mul       t6, t6, t7              /* DEQUANTIZE(inptr[DCTSIZE*4],
                                         quantptr[DCTSIZE*4]) */
    addiu     t9, zero, 4433          /* FIX_0_541196100 */
    addu      s3, s0, s1              /* z2 + z3 */
    mul       s3, s3, t9              /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
    addiu     t9, zero, 15137         /* FIX_1_847759065 */
    mul       t8, s1, t9              /* MULTIPLY(z3, FIX_1_847759065) */
    addu      t4, t5, t6
    subu      t5, t5, t6
    sll       t4, t4, 13              /* tmp0 = (z2 + z3) << CONST_BITS */
    sll       t5, t5, 13              /* tmp1 = (z2 - z3) << CONST_BITS */
    addu      t7, s3, s2              /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
    subu      t6, s3, t8              /* tmp2 =
                                         z1 + MULTIPLY(z3, -FIX_1_847759065) */
    /* Butterfly and descale (>> 11), store column of the workspace */
    addu      s0, t4, t7
    subu      s1, t4, t7
    addu      s2, t5, t6
    subu      s3, t5, t6
    addu      t4, s0, t3
    subu      s0, s0, t3
    addu      t3, s2, t1
    subu      s2, s2, t1
    addu      t1, s3, t2
    subu      s3, s3, t2
    addu      t2, s1, t0
    subu      s1, s1, t0
    shra_r.w  t4, t4, 11
    shra_r.w  t3, t3, 11
    shra_r.w  t1, t1, 11
    shra_r.w  t2, t2, 11
    shra_r.w  s1, s1, 11
    shra_r.w  s3, s3, 11
    shra_r.w  s2, s2, 11
    shra_r.w  s0, s0, 11
    sw        t4, 0(v0)
    sw        t3, 32(v0)
    sw        t1, 64(v0)
    sw        t2, 96(v0)
    sw        s1, 128(v0)
    sw        s3, 160(v0)
    sw        s2, 192(v0)
    sw        s0, 224(v0)
3:
    addiu     a1, a1, 2               /* next column of quant table */
    addiu     a0, a0, 2               /* next column of coefficients */
    bgtz      v1, 1b
    addiu     v0, v0, 4               /* (delay slot) next workspace column */
    /* Pass 2: rows */
    move      v0, sp
    addiu     v1, zero, 8
4:
    lw        t0, 8(v0)               /* z2 = (JLONG)wsptr[2] */
    lw        t1, 24(v0)              /* z3 = (JLONG)wsptr[6] */
    lw        t2, 0(v0)               /* (JLONG)wsptr[0] */
    lw        t3, 16(v0)              /* (JLONG)wsptr[4] */
    lw        s4, 4(v0)               /* (JLONG)wsptr[1] */
    lw        s5, 12(v0)              /* (JLONG)wsptr[3] */
    lw        s6, 20(v0)              /* (JLONG)wsptr[5] */
    lw        s7, 28(v0)              /* (JLONG)wsptr[7] */
    or        s4, s4, t0
    or        s4, s4, t1
    or        s4, s4, t3
    or        s4, s4, s7
    or        s4, s4, s5
    or        s4, s4, s6
    bnez      s4, 5f
    addiu     v1, v1, -1              /* (delay slot) */
    /* AC terms all zero: replicate the range-limited DC value across the row */
    shra_r.w  s5, t2, 5
    andi      s5, s5, 0x3ff           /* index into range_limit table */
    lbux      s5, s5(a3)
    lw        s1, 0(a2)
    replv.qb  s5, s5                  /* broadcast byte to all 4 lanes */
    usw       s5, 0(s1)
    usw       s5, 4(s1)
    b         6f
    nop
5:
    /* Even part */
    addu      t4, t0, t1              /* z2 + z3 */
    addiu     t8, zero, 4433          /* FIX_0_541196100 */
    mul       t5, t4, t8              /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
    addiu     t8, zero, 15137         /* FIX_1_847759065 */
    mul       t1, t1, t8              /* MULTIPLY(z3, FIX_1_847759065) */
    addiu     t8, zero, 6270          /* FIX_0_765366865 */
    mul       t0, t0, t8              /* MULTIPLY(z2, FIX_0_765366865) */
    addu      t4, t2, t3              /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
    subu      t2, t2, t3              /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
    sll       t4, t4, 13              /* tmp0 =
                                         (wsptr[0] + wsptr[4]) << CONST_BITS */
    sll       t2, t2, 13              /* tmp1 =
                                         (wsptr[0] - wsptr[4]) << CONST_BITS */
    subu      t1, t5, t1              /* tmp2 =
                                         z1 + MULTIPLY(z3, -FIX_1_847759065) */
    subu      t3, t2, t1              /* tmp12 = tmp1 - tmp2 */
    addu      t2, t2, t1              /* tmp11 = tmp1 + tmp2 */
    addu      t5, t5, t0              /* tmp3 =
                                         z1 + MULTIPLY(z2, FIX_0_765366865) */
    subu      t1, t4, t5              /* tmp13 = tmp0 - tmp3 */
    addu      t0, t4, t5              /* tmp10 = tmp0 + tmp3 */
    /* Odd part */
    lw        t4, 28(v0)              /* tmp0 = (JLONG)wsptr[7] */
    lw        t6, 12(v0)              /* tmp2 = (JLONG)wsptr[3] */
    lw        t5, 20(v0)              /* tmp1 = (JLONG)wsptr[5] */
    lw        t7, 4(v0)               /* tmp3 = (JLONG)wsptr[1] */
    addu      s0, t4, t6              /* z3 = tmp0 + tmp2 */
    addiu     t8, zero, 9633          /* FIX_1_175875602 */
    addu      s1, t5, t7              /* z4 = tmp1 + tmp3 */
    addu      s2, s0, s1              /* z3 + z4 */
    mul       s2, s2, t8              /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    addu      s3, t4, t7              /* z1 = tmp0 + tmp3 */
    addu      t9, t5, t6              /* z2 = tmp1 + tmp2 */
    addiu     t8, zero, 16069         /* FIX_1_961570560 */
    mul       s0, s0, t8              /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
    addiu     t8, zero, 3196          /* FIX_0_390180644 */
    mul       s1, s1, t8              /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
    addiu     t8, zero, 2446          /* FIX_0_298631336 */
    mul       t4, t4, t8              /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    addiu     t8, zero, 7373          /* FIX_0_899976223 */
    mul       s3, s3, t8              /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
    addiu     t8, zero, 16819         /* FIX_2_053119869 */
    mul       t5, t5, t8              /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    addiu     t8, zero, 20995         /* FIX_2_562915447 */
    mul       t9, t9, t8              /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
    addiu     t8, zero, 25172         /* FIX_3_072711026 */
    mul       t6, t6, t8              /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    addiu     t8, zero, 12299         /* FIX_1_501321110 */
    mul       t7, t7, t8              /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    subu      s0, s2, s0              /* z3 += z5 */
    subu      s1, s2, s1              /* z4 += z5 */
    addu      t4, t4, s0
    subu      t4, t4, s3              /* tmp0 */
    addu      t5, t5, s1
    subu      t5, t5, t9              /* tmp1 */
    addu      t6, t6, s0
    subu      t6, t6, t9              /* tmp2 */
    addu      t7, t7, s1
    subu      t7, t7, s3              /* tmp3 */
    /* Butterfly, descale (>> 18), clamp through range_limit, store bytes */
    addu      s0, t0, t7
    subu      t0, t0, t7
    addu      t7, t2, t6
    subu      t2, t2, t6
    addu      t6, t3, t5
    subu      t3, t3, t5
    addu      t5, t1, t4
    subu      t1, t1, t4
    shra_r.w  s0, s0, 18
    shra_r.w  t7, t7, 18
    shra_r.w  t6, t6, 18
    shra_r.w  t5, t5, 18
    shra_r.w  t1, t1, 18
    shra_r.w  t3, t3, 18
    shra_r.w  t2, t2, 18
    shra_r.w  t0, t0, 18
    andi      s0, s0, 0x3ff
    andi      t7, t7, 0x3ff
    andi      t6, t6, 0x3ff
    andi      t5, t5, 0x3ff
    andi      t1, t1, 0x3ff
    andi      t3, t3, 0x3ff
    andi      t2, t2, 0x3ff
    andi      t0, t0, 0x3ff
    lw        s1, 0(a2)
    lbux      s0, s0(a3)
    lbux      t7, t7(a3)
    lbux      t6, t6(a3)
    lbux      t5, t5(a3)
    lbux      t1, t1(a3)
    lbux      t3, t3(a3)
    lbux      t2, t2(a3)
    lbux      t0, t0(a3)
    sb        s0, 0(s1)
    sb        t7, 1(s1)
    sb        t6, 2(s1)
    sb        t5, 3(s1)
    sb        t1, 4(s1)
    sb        t3, 5(s1)
    sb        t2, 6(s1)
    sb        t0, 7(s1)
6:
    addiu     v0, v0, 32              /* next workspace row */
    bgtz      v1, 4b
    addiu     a2, a2, 4               /* (delay slot) next output row pointer */
    addiu     sp, sp, 256             /* release workspace */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j         ra
    nop

END(jsimd_idct_islow_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * Fast (AAN) inverse DCT, column pass.  Processes two columns at a time
 * using paired-halfword (.ph) arithmetic: each 32-bit register carries one
 * 16-bit lane per column.  Dequantization is fused with the load via
 * muleq_s.w.ph{l,r}.  The scale constants (Q15 fixed point) are read from
 * the mips_idct_ifast_coefs table.  Columns whose AC terms are all zero
 * take a shortcut that replicates the dequantized DC term.
 *
 * a0 = inptr
 * a1 = quantptr
 * a2 = wsptr
 * a3 = mips_idct_ifast_coefs
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu     t9, a0, 16              /* end address */
    or        AT, a3, zero            /* AT = coefficient table base */

0:
    lw        s0, 0(a1)               /* quantptr[DCTSIZE*0] */
    lw        t0, 0(a0)               /* inptr[DCTSIZE*0] */
    lw        t1, 16(a0)              /* inptr[DCTSIZE*1] */
    muleq_s.w.phl v0, t0, s0          /* tmp0 ... */
    lw        t2, 32(a0)              /* inptr[DCTSIZE*2] */
    lw        t3, 48(a0)              /* inptr[DCTSIZE*3] */
    lw        t4, 64(a0)              /* inptr[DCTSIZE*4] */
    lw        t5, 80(a0)              /* inptr[DCTSIZE*5] */
    muleq_s.w.phr t0, t0, s0          /* ... tmp0 ... */
    lw        t6, 96(a0)              /* inptr[DCTSIZE*6] */
    lw        t7, 112(a0)             /* inptr[DCTSIZE*7] */
    or        s4, t1, t2
    or        s5, t3, t4
    bnez      s4, 1f
    ins       t0, v0, 16, 16          /* (delay slot) ... tmp0 */
    bnez      s5, 1f
    or        s6, t5, t6              /* (delay slot) */
    or        s6, s6, t7
    bnez      s6, 1f
    sw        t0, 0(a2)               /* (delay slot) wsptr[DCTSIZE*0] */
    /* AC terms all zero: replicate DC down both columns */
    sw        t0, 16(a2)              /* wsptr[DCTSIZE*1] */
    sw        t0, 32(a2)              /* wsptr[DCTSIZE*2] */
    sw        t0, 48(a2)              /* wsptr[DCTSIZE*3] */
    sw        t0, 64(a2)              /* wsptr[DCTSIZE*4] */
    sw        t0, 80(a2)              /* wsptr[DCTSIZE*5] */
    sw        t0, 96(a2)              /* wsptr[DCTSIZE*6] */
    sw        t0, 112(a2)             /* wsptr[DCTSIZE*7] */
    addiu     a0, a0, 4
    b         2f
    addiu     a1, a1, 4               /* (delay slot) */

1:
    /* Even part */
    lw        s1, 32(a1)              /* quantptr[DCTSIZE*2] */
    lw        s2, 64(a1)              /* quantptr[DCTSIZE*4] */
    muleq_s.w.phl v0, t2, s1          /* tmp1 ... */
    muleq_s.w.phr t2, t2, s1          /* ... tmp1 ... */
    lw        s0, 16(a1)              /* quantptr[DCTSIZE*1] */
    lw        s1, 48(a1)              /* quantptr[DCTSIZE*3] */
    lw        s3, 96(a1)              /* quantptr[DCTSIZE*6] */
    muleq_s.w.phl v1, t4, s2          /* tmp2 ... */
    muleq_s.w.phr t4, t4, s2          /* ... tmp2 ... */
    lw        s2, 80(a1)              /* quantptr[DCTSIZE*5] */
    lw        t8, 4(AT)               /* FIX(1.414213562) */
    ins       t2, v0, 16, 16          /* ... tmp1 */
    muleq_s.w.phl v0, t6, s3          /* tmp3 ... */
    muleq_s.w.phr t6, t6, s3          /* ... tmp3 ... */
    ins       t4, v1, 16, 16          /* ... tmp2 */
    addq.ph   s4, t0, t4              /* tmp10 */
    subq.ph   s5, t0, t4              /* tmp11 */
    ins       t6, v0, 16, 16          /* ... tmp3 */
    subq.ph   s6, t2, t6              /* tmp12 ... */
    addq.ph   s7, t2, t6              /* tmp13 */
    mulq_s.ph s6, s6, t8              /* ... tmp12 ... */
    addq.ph   t0, s4, s7              /* tmp0 */
    subq.ph   t6, s4, s7              /* tmp3 */
    /* Odd part */
    muleq_s.w.phl v0, t1, s0          /* tmp4 ... */
    muleq_s.w.phr t1, t1, s0          /* ... tmp4 ... */
    shll_s.ph s6, s6, 1               /* x2 */
    lw        s3, 112(a1)             /* quantptr[DCTSIZE*7] */
    subq.ph   s6, s6, s7              /* ... tmp12 */
    muleq_s.w.phl v1, t7, s3          /* tmp7 ... */
    muleq_s.w.phr t7, t7, s3          /* ... tmp7 ... */
    ins       t1, v0, 16, 16          /* ... tmp4 */
    addq.ph   t2, s5, s6              /* tmp1 */
    subq.ph   t4, s5, s6              /* tmp2 */
    muleq_s.w.phl v0, t5, s2          /* tmp6 ... */
    muleq_s.w.phr t5, t5, s2          /* ... tmp6 ... */
    ins       t7, v1, 16, 16          /* ... tmp7 */
    addq.ph   s5, t1, t7              /* z11 */
    subq.ph   s6, t1, t7              /* z12 */
    muleq_s.w.phl v1, t3, s1          /* tmp5 ... */
    muleq_s.w.phr t3, t3, s1          /* ... tmp5 ... */
    ins       t5, v0, 16, 16          /* ... tmp6 */
    ins       t3, v1, 16, 16          /* ... tmp5 */
    addq.ph   s7, t5, t3              /* z13 */
    subq.ph   v0, t5, t3              /* z10 */
    addq.ph   t7, s5, s7              /* tmp7 */
    subq.ph   s5, s5, s7              /* tmp11 ... */
    addq.ph   v1, v0, s6              /* z5 ... */
    mulq_s.ph s5, s5, t8              /* ... tmp11 */
    lw        t8, 8(AT)               /* FIX(1.847759065) */
    lw        s4, 0(AT)               /* FIX(1.082392200) */
    addq.ph   s0, t0, t7
    subq.ph   s1, t0, t7
    mulq_s.ph v1, v1, t8              /* ... z5 */
    shll_s.ph s5, s5, 1               /* x2 */
    lw        t8, 12(AT)              /* FIX(-2.613125930) */
    sw        s0, 0(a2)               /* wsptr[DCTSIZE*0] */
    shll_s.ph v0, v0, 1               /* x4 (with the shll below) */
    mulq_s.ph v0, v0, t8              /* tmp12 ... */
    mulq_s.ph s4, s6, s4              /* tmp10 ... */
    shll_s.ph v1, v1, 1               /* x2 */
    addiu     a0, a0, 4
    addiu     a1, a1, 4
    sw        s1, 112(a2)             /* wsptr[DCTSIZE*7] */
    shll_s.ph s6, v0, 1               /* x4 */
    shll_s.ph s4, s4, 1               /* x2 */
    addq.ph   s6, s6, v1              /* ... tmp12 */
    subq.ph   t5, s6, t7              /* tmp6 */
    subq.ph   s4, s4, v1              /* ... tmp10 */
    subq.ph   t3, s5, t5              /* tmp5 */
    addq.ph   s2, t2, t5
    addq.ph   t1, s4, t3              /* tmp4 */
    subq.ph   s3, t2, t5
    sw        s2, 16(a2)              /* wsptr[DCTSIZE*1] */
    sw        s3, 96(a2)              /* wsptr[DCTSIZE*6] */
    addq.ph   v0, t4, t3
    subq.ph   v1, t4, t3
    sw        v0, 32(a2)              /* wsptr[DCTSIZE*2] */
    sw        v1, 80(a2)              /* wsptr[DCTSIZE*5] */
    addq.ph   v0, t6, t1
    subq.ph   v1, t6, t1
    sw        v0, 64(a2)              /* wsptr[DCTSIZE*4] */
    sw        v1, 48(a2)              /* wsptr[DCTSIZE*3] */

2:
    bne       a0, t9, 0b
    addiu     a2, a2, 4               /* (delay slot) next workspace column pair */

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j         ra
    nop

END(jsimd_idct_ifast_cols_dspr2)
| |
| |
| /*****************************************************************************/ |
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * Fast (AAN) inverse DCT, row pass.  Processes two workspace rows per
 * iteration with paired-halfword arithmetic (one 16-bit lane per row),
 * descales, recenters by adding 0x80 to every byte (s8 = 0x80808080,
 * wrapping from the signed result to unsigned samples), and writes the
 * 8 output bytes of both rows.  Rows whose AC terms are all zero take a
 * DC-replication shortcut.
 *
 * a0 = wsptr
 * a1 = output_buf
 * a2 = output_col
 * a3 = mips_idct_ifast_coefs
 *
 * NOTE: MIPS branch delay slots — the instruction immediately after every
 * branch/jump executes unconditionally before the branch takes effect.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu     t9, a0, 128             /* end address */
    lui       s8, 0x8080
    ori       s8, s8, 0x8080          /* s8 = 0x80808080 (sample recentering) */

0:
    lw        AT, 36(sp)              /* restore $a3 (mips_idct_ifast_coefs) */
    lw        t0, 0(a0)               /* wsptr[DCTSIZE*0+0/1]  b a */
    lw        s0, 16(a0)              /* wsptr[DCTSIZE*1+0/1]  B A */
    lw        t2, 4(a0)               /* wsptr[DCTSIZE*0+2/3]  d c */
    lw        s2, 20(a0)              /* wsptr[DCTSIZE*1+2/3]  D C */
    lw        t4, 8(a0)               /* wsptr[DCTSIZE*0+4/5]  f e */
    lw        s4, 24(a0)              /* wsptr[DCTSIZE*1+4/5]  F E */
    lw        t6, 12(a0)              /* wsptr[DCTSIZE*0+6/7]  h g */
    lw        s6, 28(a0)              /* wsptr[DCTSIZE*1+6/7]  H G */
    precrq.ph.w t1, s0, t0            /* B b */
    ins       t0, s0, 16, 16          /* A a */
    bnez      t1, 1f
    or        s0, t2, s2              /* (delay slot) */
    bnez      s0, 1f
    or        s0, t4, s4              /* (delay slot) */
    bnez      s0, 1f
    or        s0, t6, s6              /* (delay slot) */
    bnez      s0, 1f
    shll_s.ph s0, t0, 2               /* (delay slot) A a, descaled */
    /* AC terms all zero: broadcast each row's DC across its 8 outputs */
    lw        a3, 0(a1)
    lw        AT, 4(a1)
    precrq.ph.w t0, s0, s0            /* A A */
    ins       s0, s0, 16, 16          /* a a */
    addu      a3, a3, a2
    addu      AT, AT, a2
    precrq.qb.ph t0, t0, t0           /* A A A A */
    precrq.qb.ph s0, s0, s0           /* a a a a */
    addu.qb   s0, s0, s8              /* recenter to unsigned */
    addu.qb   t0, t0, s8
    sw        s0, 0(a3)
    sw        s0, 4(a3)
    sw        t0, 0(AT)
    sw        t0, 4(AT)
    addiu     a0, a0, 32
    bne       a0, t9, 0b
    addiu     a1, a1, 8               /* (delay slot) */
    b         2f
    nop

1:
    /* Interleave the two rows into per-lane pairs */
    precrq.ph.w t3, s2, t2
    ins       t2, s2, 16, 16
    precrq.ph.w t5, s4, t4
    ins       t4, s4, 16, 16
    precrq.ph.w t7, s6, t6
    ins       t6, s6, 16, 16
    /* Even part */
    lw        t8, 4(AT)               /* FIX(1.414213562) */
    addq.ph   s4, t0, t4              /* tmp10 */
    subq.ph   s5, t0, t4              /* tmp11 */
    subq.ph   s6, t2, t6              /* tmp12 ... */
    addq.ph   s7, t2, t6              /* tmp13 */
    mulq_s.ph s6, s6, t8              /* ... tmp12 ... */
    addq.ph   t0, s4, s7              /* tmp0 */
    subq.ph   t6, s4, s7              /* tmp3 */
    shll_s.ph s6, s6, 1               /* x2 */
    subq.ph   s6, s6, s7              /* ... tmp12 */
    addq.ph   t2, s5, s6              /* tmp1 */
    subq.ph   t4, s5, s6              /* tmp2 */
    /* Odd part */
    addq.ph   s5, t1, t7              /* z11 */
    subq.ph   s6, t1, t7              /* z12 */
    addq.ph   s7, t5, t3              /* z13 */
    subq.ph   v0, t5, t3              /* z10 */
    addq.ph   t7, s5, s7              /* tmp7 */
    subq.ph   s5, s5, s7              /* tmp11 ... */
    addq.ph   v1, v0, s6              /* z5 ... */
    mulq_s.ph s5, s5, t8              /* ... tmp11 */
    lw        t8, 8(AT)               /* FIX(1.847759065) */
    lw        s4, 0(AT)               /* FIX(1.082392200) */
    addq.ph   s0, t0, t7              /* tmp0 + tmp7 */
    subq.ph   s7, t0, t7              /* tmp0 - tmp7 */
    mulq_s.ph v1, v1, t8              /* ... z5 */
    lw        a3, 0(a1)
    lw        t8, 12(AT)              /* FIX(-2.613125930) */
    shll_s.ph s5, s5, 1               /* x2 */
    addu      a3, a3, a2
    shll_s.ph v0, v0, 1               /* x4 (with the shll below) */
    mulq_s.ph v0, v0, t8              /* tmp12 ... */
    mulq_s.ph s4, s6, s4              /* tmp10 ... */
    shll_s.ph v1, v1, 1               /* x2 */
    addiu     a0, a0, 32
    addiu     a1, a1, 8
    shll_s.ph s6, v0, 1               /* x4 */
    shll_s.ph s4, s4, 1               /* x2 */
    addq.ph   s6, s6, v1              /* ... tmp12 */
    shll_s.ph s0, s0, 2
    subq.ph   t5, s6, t7              /* tmp6 */
    subq.ph   s4, s4, v1              /* ... tmp10 */
    subq.ph   t3, s5, t5              /* tmp5 */
    shll_s.ph s7, s7, 2
    addq.ph   t1, s4, t3              /* tmp4 */
    addq.ph   s1, t2, t5              /* tmp1 + tmp6 */
    subq.ph   s6, t2, t5              /* tmp1 - tmp6 */
    addq.ph   s2, t4, t3              /* tmp2 + tmp5 */
    subq.ph   s5, t4, t3              /* tmp2 - tmp5 */
    addq.ph   s4, t6, t1              /* tmp3 + tmp4 */
    subq.ph   s3, t6, t1              /* tmp3 - tmp4 */
    /* Descale, repack both rows, recenter, and store 8 bytes per row */
    shll_s.ph s1, s1, 2
    shll_s.ph s2, s2, 2
    shll_s.ph s3, s3, 2
    shll_s.ph s4, s4, 2
    shll_s.ph s5, s5, 2
    shll_s.ph s6, s6, 2
    precrq.ph.w t0, s1, s0            /* B A */
    ins       s0, s1, 16, 16          /* b a */
    precrq.ph.w t2, s3, s2            /* D C */
    ins       s2, s3, 16, 16          /* d c */
    precrq.ph.w t4, s5, s4            /* F E */
    ins       s4, s5, 16, 16          /* f e */
    precrq.ph.w t6, s7, s6            /* H G */
    ins       s6, s7, 16, 16          /* h g */
    precrq.qb.ph t0, t2, t0           /* D C B A */
    precrq.qb.ph s0, s2, s0           /* d c b a */
    precrq.qb.ph t4, t6, t4           /* H G F E */
    precrq.qb.ph s4, s6, s4           /* h g f e */
    addu.qb   s0, s0, s8
    addu.qb   s4, s4, s8
    sw        s0, 0(a3)               /* outptr[0/1/2/3]  d c b a */
    sw        s4, 4(a3)               /* outptr[4/5/6/7]  h g f e */
    lw        a3, -4(a1)
    addu.qb   t0, t0, s8
    addu      a3, a3, a2
    addu.qb   t4, t4, s8
    sw        t0, 0(a3)               /* outptr[0/1/2/3]  D C B A */
    bne       a0, t9, 0b
    sw        t4, 4(a3)               /* (delay slot) outptr[4/5/6/7]  H G F E */

2:

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j         ra
    nop

END(jsimd_idct_ifast_rows_dspr2)
| |
| |
| /*****************************************************************************/ |
| LEAF_DSPR2(jsimd_fdct_islow_dspr2) |
| /* |
| * Forward DCT (accurate integer "islow" method, per the function name) of |
| * one 8x8 block of 16-bit coefficients, performed in place. |
| * |
| * a0 = data (pointer to 64 16-bit samples, row-major, 16 bytes per row) |
| * |
| * Pass 1 (label 1:) transforms the 8 rows, two samples at a time, using |
| * packed 16-bit DSP MAC instructions (dpa.w.ph); t0-t9 hold pairs of |
| * fixed-point multipliers packed as hi|lo halfwords. Odd outputs are |
| * descaled with extr_r.w (round + >> 11); even outputs with << 2 or a |
| * rounded >> 11. |
| * Pass 2 (label 2:) transforms the 8 columns one at a time with 32-bit |
| * MAC instructions (mult/madd/msub); t0-t9 and a1 are reloaded with the |
| * scalar multipliers c0..c10. Outputs are descaled with extr_r.w |
| * (round + >> 15) or shra_r.w (round + >> 2). |
| * |
| * Clobbers: t0-t9, a1-a3, v0-v1, all four DSP accumulators; s0-s8 are |
| * preserved via the stack. |
| */ |
| SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 |
| |
| /* Pass-1 multiplier pairs, packed hi|lo for dpa.w.ph; negative halves |
| are written as 16-bit two's-complement hex. */ |
| lui t0, 6437 |
| ori t0, 2260 /* t0 = 6437 | 2260 */ |
| lui t1, 9633 |
| ori t1, 11363 /* t1 = 9633 | 11363 */ |
| lui t2, 0xd39e |
| ori t2, 0xe6dc /* t2 = -11362 | -6436 */ |
| lui t3, 0xf72d |
| ori t3, 9633 /* t3 = -2259 | 9633 */ |
| lui t4, 2261 |
| ori t4, 9633 /* t4 = 2261 | 9633 */ |
| lui t5, 0xd39e |
| ori t5, 6437 /* t5 = -11362 | 6437 */ |
| lui t6, 9633 |
| ori t6, 0xd39d /* t6 = 9633 | -11363 */ |
| lui t7, 0xe6dc |
| ori t7, 2260 /* t7 = -6436 | 2260 */ |
| lui t8, 4433 |
| ori t8, 10703 /* t8 = 4433 | 10703 */ |
| lui t9, 0xd630 |
| ori t9, 4433 /* t9 = -10704 | 4433 */ |
| li s8, 8 /* s8 = row counter */ |
| move a1, a0 /* a1 = cursor over the 8 rows */ |
| 1: /* ---- Pass 1: one 8-sample row per iteration ---- */ |
| lw s0, 0(a1) /* tmp0 = 1|0 */ |
| lw s1, 4(a1) /* tmp1 = 3|2 */ |
| lw s2, 8(a1) /* tmp2 = 5|4 */ |
| lw s3, 12(a1) /* tmp3 = 7|6 */ |
| packrl.ph s1, s1, s1 /* tmp1 = 2|3 */ |
| packrl.ph s3, s3, s3 /* tmp3 = 6|7 */ |
| subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */ |
| subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */ |
| mult $0, $0 /* ac0 = 0 */ |
| dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */ |
| dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */ |
| mult $ac1, $0, $0 /* ac1 = 0 */ |
| dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */ |
| dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */ |
| mult $ac2, $0, $0 /* ac2 = 0 */ |
| dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */ |
| dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */ |
| mult $ac3, $0, $0 /* ac3 = 0 */ |
| dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */ |
| dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */ |
| addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */ |
| addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */ |
| extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ |
| extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ |
| extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */ |
| extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */ |
| addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ |
| subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ |
| sh s0, 2(a1) /* row[1] = tmp0 */ |
| sh s1, 6(a1) /* row[3] = tmp1 */ |
| sh s2, 10(a1) /* row[5] = tmp2 */ |
| sh s3, 14(a1) /* row[7] = tmp3 */ |
| mult $0, $0 /* ac0 = 0 */ |
| dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */ |
| mult $ac1, $0, $0 /* ac1 = 0 */ |
| dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */ |
| sra s4, s5, 16 /* tmp4 = t11 */ |
| addiu a1, a1, 16 /* advance to next row */ |
| addiu s8, s8, -1 /* decrement row counter */ |
| extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ |
| extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ |
| addu s2, s5, s4 /* tmp2 = t10 + t11 */ |
| subu s3, s5, s4 /* tmp3 = t10 - t11 */ |
| sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */ |
| sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */ |
| sh s2, -16(a1) /* row[0] = tmp2 */ |
| sh s3, -8(a1) /* row[4] = tmp3 */ |
| sh s0, -12(a1) /* row[2] = tmp0 */ |
| bgtz s8, 1b /* loop over all 8 rows */ |
| sh s1, -4(a1) /* row[6] = tmp1 (branch delay slot) */ |
| /* Pass-2 scalar multipliers c0..c10 (negatives are applied via msub |
| below), reached once after pass 1 finishes. */ |
| li t0, 2260 /* c0 */ |
| li t1, 11363 /* c1 */ |
| li t2, 9633 /* c2 */ |
| li t3, 6436 /* c3 */ |
| li t4, 6437 /* c4 */ |
| li t5, 2261 /* c5 */ |
| li t6, 11362 /* c6 */ |
| li t7, 2259 /* c7 */ |
| li t8, 4433 /* c8 */ |
| li t9, 10703 /* c9 */ |
| li a1, 10704 /* c10 */ |
| li s8, 8 /* s8 = column counter */ |
| |
| 2: /* ---- Pass 2: one 8-sample column per iteration ---- */ |
| lh a2, 0(a0) /* 0 */ |
| lh a3, 16(a0) /* 8 */ |
| lh v0, 32(a0) /* 16 */ |
| lh v1, 48(a0) /* 24 */ |
| lh s4, 64(a0) /* 32 */ |
| lh s5, 80(a0) /* 40 */ |
| lh s6, 96(a0) /* 48 */ |
| lh s7, 112(a0) /* 56 */ |
| addu s2, v0, s5 /* tmp2 = 16 + 40 */ |
| subu s5, v0, s5 /* tmp5 = 16 - 40 */ |
| addu s3, v1, s4 /* tmp3 = 24 + 32 */ |
| subu s4, v1, s4 /* tmp4 = 24 - 32 */ |
| addu s0, a2, s7 /* tmp0 = 0 + 56 */ |
| subu s7, a2, s7 /* tmp7 = 0 - 56 */ |
| addu s1, a3, s6 /* tmp1 = 8 + 48 */ |
| subu s6, a3, s6 /* tmp6 = 8 - 48 */ |
| addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */ |
| subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */ |
| addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */ |
| subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */ |
| mult s7, t1 /* ac0 = tmp7 * c1 */ |
| madd s4, t0 /* ac0 += tmp4 * c0 */ |
| madd s5, t4 /* ac0 += tmp5 * c4 */ |
| madd s6, t2 /* ac0 += tmp6 * c2 */ |
| mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */ |
| msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */ |
| msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */ |
| msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */ |
| mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */ |
| madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */ |
| madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */ |
| msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */ |
| mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */ |
| msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */ |
| madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */ |
| msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */ |
| extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */ |
| extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */ |
| extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */ |
| extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */ |
| addiu s8, s8, -1 /* decrement column counter */ |
| addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */ |
| subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */ |
| sh s0, 16(a0) /* col row 1 = tmp0 */ |
| sh s1, 48(a0) /* col row 3 = tmp1 */ |
| sh s2, 80(a0) /* col row 5 = tmp2 */ |
| sh s3, 112(a0) /* col row 7 = tmp3 */ |
| mult v0, t8 /* ac0 = tmp12 * c8 */ |
| madd v1, t9 /* ac0 += tmp13 * c9 */ |
| mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */ |
| msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */ |
| addiu a0, a0, 2 /* advance to next column */ |
| extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */ |
| extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */ |
| shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */ |
| shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */ |
| sh s4, -2(a0) /* col row 0 = tmp4 */ |
| sh s5, 62(a0) /* col row 4 = tmp5 */ |
| sh s6, 30(a0) /* col row 2 = tmp6 */ |
| bgtz s8, 2b /* loop over all 8 columns */ |
| sh s7, 94(a0) /* col row 6 = tmp7 (branch delay slot) */ |
| |
| RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 |
| |
| jr ra |
| nop /* branch delay slot */ |
| |
| END(jsimd_fdct_islow_dspr2) |
| |
| |
| /**************************************************************************/ |
| LEAF_DSPR2(jsimd_fdct_ifast_dspr2) |
| /* |
| * a0 = data |
| */ |
| .set at |
| |
| SAVE_REGS_ON_STACK 8, s0, s1 |
| |
| li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) | |
| (334 &
|