loongarch/filter_lsx_intrinsics.c - third_party/libpng - Git at Google

 /* filter_lsx_intrinsics.c - LSX optimized filter functions
  *
  * Copyright (c) 2021 Loongson Technology Corporation Limited
  * All rights reserved.
  * Copyright (c) 2018 Cosmin Truta
  * Copyright (c) 2016 Glenn Randers-Pehrson
  * Contributed by Jin Bo (jinbo@loongson.cn)
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
  * and license in png.h
  */

 #include "../pngpriv.h"

 #ifdef PNG_READ_SUPPORTED

 #if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

 #include <lsxintrin.h>

 #define LSX_LD(psrc) __lsx_vld((psrc), 0)

 #define LSX_LD_2(psrc, stride, out0, out1) \
 {                                          \
    out0 = LSX_LD(psrc);                    \
    out1 = LSX_LD(psrc + stride);           \
 }

 #define LSX_LD_4(psrc, stride, out0, out1, out2, out3) \
 {                                                      \
    LSX_LD_2(psrc, stride, out0, out1);                 \
    LSX_LD_2(psrc + stride * 2, stride, out2, out3);    \
 }

 #define LSX_ST(in, pdst) __lsx_vst(in, (pdst), 0)

 #define LSX_ST_2(in0, in1, pdst, stride) \
 {                                        \
    LSX_ST(in0, pdst);                    \
    LSX_ST(in1, pdst + stride);           \
 }

 #define LSX_ST_4(in0, in1, in2, in3, pdst, stride) \
 {                                                  \
    LSX_ST_2(in0, in1, pdst, stride);               \
    LSX_ST_2(in2, in3, pdst + stride * 2, stride);  \
 }

 #define LSX_ADD_B(in0, in1, out0) \
 {                                 \
    out0 = __lsx_vadd_b(in0, in1); \
 }

 #define LSX_ADD_B_2(in0, in1, in2, in3, out0, out1) \
 {                                                   \
    LSX_ADD_B(in0, in1, out0);                       \
    LSX_ADD_B(in2, in3, out1);                       \
 }

 #define LSX_ADD_B_4(in0, in1, in2, in3, in4, in5,     \
                     in6, in7, out0, out1, out2, out3) \
 {                                                     \
    LSX_ADD_B_2(in0, in1, in2, in3, out0, out1);       \
    LSX_ADD_B_2(in4, in5, in6, in7, out2, out3);       \
 }

 #define LSX_ABS_B_3(in0, in1, in2, out0, out1, out2) \
 {                                                    \
    out0 = __lsx_vadda_h(in0, zero);                  \
    out1 = __lsx_vadda_h(in1, zero);                  \
    out2 = __lsx_vadda_h(in2, zero);                  \
 }

 #define LSX_ILVL_B(in_h, in_l, out0)  \
 {                                     \
    out0 = __lsx_vilvl_b(in_h, in_l);  \
 }

 #define LSX_ILVL_B_2(in0_h, in0_l, in1_h, in1_l, out0, out1) \
 {                                                            \
    LSX_ILVL_B(in0_h, in0_l, out0);                           \
    LSX_ILVL_B(in1_h, in1_l, out1);                           \
 }

 #define LSX_HSUB_HU_BU_2(in0, in1, out0, out1) \
 {                                              \
    out0 = __lsx_vhsubw_hu_bu(in0, in0);        \
    out1 = __lsx_vhsubw_hu_bu(in1, in1);        \
 }

 #define LSX_CMP_PICK_SMALLER(in0, in1, in2, in3, in4, in5, out0) \
 {                                                                \
    __m128i _cmph, _cmpb, _in0, _in3;                             \
    _cmph = __lsx_vslt_h(in1, in0);                               \
    _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
    _in0  = __lsx_vmin_bu(in0,in1);                               \
    _in3  = __lsx_vbitsel_v(in3, in4, _cmpb);                     \
    _cmph = __lsx_vslt_h(in2, _in0);                              \
    _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
    _in3  = __lsx_vbitsel_v(_in3, in5, _cmpb);                    \
    out0  = __lsx_vadd_b(out0, _in3);                             \
 }

 void png_read_filter_row_up_lsx(png_row_infop row_info, png_bytep row,
                                 png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    png_bytep rp = row;
    png_const_bytep pp = prev_row;
    __m128i vec_0, vec_1, vec_2, vec_3;
    __m128i vec_4, vec_5, vec_6, vec_7;

    while (n >= 64)
    {
       LSX_LD_4(rp, 16, vec_0, vec_1, vec_2, vec_3);
       LSX_LD_4(pp, 16, vec_4, vec_5, vec_6, vec_7);
       pp += 64;
       LSX_ADD_B_4(vec_0 ,vec_4, vec_1, vec_5, vec_2, vec_6,
                   vec_3, vec_7, vec_0, vec_1, vec_2, vec_3);
       LSX_ST_4(vec_0, vec_1, vec_2, vec_3, rp, 16);
       rp += 64;
       n -= 64;
    }
    if (n & 63)
    {
       if (n >= 32)
       {
          LSX_LD_2(rp, 16, vec_0, vec_1);
          LSX_LD_2(pp, 16, vec_2, vec_3);
          pp += 32;
          LSX_ADD_B_2(vec_0, vec_2, vec_1, vec_3, vec_0, vec_1);
          LSX_ST_2(vec_0, vec_1, rp, 16);
          rp += 32;
          n -= 32;
       }
       if (n & 31)
       {
          if (n >= 16)
          {
             vec_0 = LSX_LD(rp);
             vec_1 = LSX_LD(pp);
             pp += 16;
             LSX_ADD_B(vec_0, vec_1, vec_0);
             LSX_ST(vec_0, rp);
             rp += 16;
             n -= 16;
          }
          if (n >= 8)
          {
             vec_0 = __lsx_vldrepl_d(rp, 0);
             vec_1 = __lsx_vldrepl_d(pp, 0);
             vec_0 = __lsx_vadd_b(vec_0, vec_1);
             __lsx_vstelm_d(vec_0, rp, 0, 0);
             rp += 8;
             pp += 8;
             n -= 8;
          }
          while (n--)
          {
             *rp = *rp + *pp++;
             rp++;
          }
       }
    }
 }

 void png_read_filter_row_sub3_lsx(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    png_uint_32 tmp;
    png_bytep nxt = row;
    __m128i vec_0, vec_1;

    PNG_UNUSED(prev_row);

    vec_0 = __lsx_vldrepl_w(nxt, 0);
    nxt += 3;
    n -= 3;

    while (n >= 3)
    {
       vec_1 = __lsx_vldrepl_w(nxt, 0);
       vec_1 = __lsx_vadd_b(vec_1, vec_0);
       __lsx_vstelm_h(vec_1, nxt, 0, 0);
       vec_0 = vec_1;
       nxt += 2;
       __lsx_vstelm_b(vec_1, nxt, 0, 2);
       nxt += 1;
       n -= 3;
    }

    row = nxt - 3;
    while (n--)
    {
       *nxt = *nxt + *row++;
       nxt++;
    }
 }

 void png_read_filter_row_sub4_lsx(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    __m128i vec_0, vec_1;

    PNG_UNUSED(prev_row);

    vec_0 = __lsx_vldrepl_w(row, 0);
    row += 4;
    n -= 4;

    while (n >= 4)
    {
       vec_1 = __lsx_vldrepl_w(row, 0);
       vec_1 = __lsx_vadd_b(vec_1, vec_0);
       __lsx_vstelm_w(vec_1, row, 0, 0);
       vec_0 = vec_1;
       row += 4;
       n -= 4;
    }
 }

 void png_read_filter_row_avg3_lsx(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    png_bytep nxt = row;
    png_const_bytep prev_nxt = prev_row;
    __m128i vec_0, vec_1, vec_2;

    vec_0 = __lsx_vldrepl_w(nxt, 0);
    vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
    prev_nxt += 3;
    vec_1 = __lsx_vsrli_b(vec_1, 1);
    vec_1 = __lsx_vadd_b(vec_1, vec_0);
    __lsx_vstelm_h(vec_1, nxt, 0, 0);
    nxt += 2;
    __lsx_vstelm_b(vec_1, nxt, 0, 2);
    nxt += 1;
    n -= 3;

    while (n >= 3)
    {
       vec_2 = vec_1;
       vec_0 = __lsx_vldrepl_w(nxt, 0);
       vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
       prev_nxt += 3;

       vec_1 = __lsx_vavg_bu(vec_1, vec_2);
       vec_1 = __lsx_vadd_b(vec_1, vec_0);

       __lsx_vstelm_h(vec_1, nxt, 0, 0);
       nxt += 2;
       __lsx_vstelm_b(vec_1, nxt, 0, 2);
       nxt += 1;
       n -= 3;
    }

    row = nxt - 3;
    while (n--)
    {
       vec_2 = __lsx_vldrepl_b(row, 0);
       row++;
       vec_0 = __lsx_vldrepl_b(nxt, 0);
       vec_1 = __lsx_vldrepl_b(prev_nxt, 0);
       prev_nxt++;

       vec_1 = __lsx_vavg_bu(vec_1, vec_2);
       vec_1 = __lsx_vadd_b(vec_1, vec_0);

       __lsx_vstelm_b(vec_1, nxt, 0, 0);
       nxt++;
    }
 }

 void png_read_filter_row_avg4_lsx(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    __m128i vec_0, vec_1, vec_2;

    vec_0 = __lsx_vldrepl_w(row, 0);
    vec_1 = __lsx_vldrepl_w(prev_row, 0);
    prev_row += 4;
    vec_1 = __lsx_vsrli_b(vec_1, 1);
    vec_1 = __lsx_vadd_b(vec_1, vec_0);
    __lsx_vstelm_w(vec_1, row, 0, 0);
    row += 4;
    n -= 4;

    while (n >= 4)
    {
       vec_2 = vec_1;
       vec_0 = __lsx_vldrepl_w(row, 0);
       vec_1 = __lsx_vldrepl_w(prev_row, 0);
       prev_row += 4;

       vec_1 = __lsx_vavg_bu(vec_1, vec_2);
       vec_1 = __lsx_vadd_b(vec_1, vec_0);

       __lsx_vstelm_w(vec_1, row, 0, 0);
       row += 4;
       n -= 4;
    }
 }

 void png_read_filter_row_paeth3_lsx(png_row_infop row_info,
                                     png_bytep row,
                                     png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    png_bytep nxt = row;
    png_const_bytep prev_nxt = prev_row;
    __m128i vec_a, vec_b, vec_c, vec_d;
    __m128i vec_pa, vec_pb, vec_pc;
    __m128i zero = {0};

    vec_a = __lsx_vldrepl_w(nxt, 0);
    vec_b = __lsx_vldrepl_w(prev_nxt, 0);
    prev_nxt += 3;
    vec_d = __lsx_vadd_b(vec_a, vec_b);
    __lsx_vstelm_h(vec_d, nxt, 0, 0);
    nxt += 2;
    __lsx_vstelm_b(vec_d, nxt, 0, 2);
    nxt += 1;
    n -= 3;

    while (n >= 3)
    {
       vec_a = vec_d;
       vec_c = vec_b;
       vec_b = __lsx_vldrepl_w(prev_nxt, 0);
       prev_nxt += 3;
       vec_d = __lsx_vldrepl_w(nxt, 0);

       LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
       LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
       vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
       LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
       LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

       __lsx_vstelm_h(vec_d, nxt, 0, 0);
       nxt += 2;
       __lsx_vstelm_b(vec_d, nxt, 0, 2);
       nxt += 1;
       n -= 3;
    }

    prev_row = prev_nxt - 3;
    row = nxt - 3;
    while (n--)
    {
       vec_a = __lsx_vldrepl_b(row, 0);
       row++;
       vec_b = __lsx_vldrepl_b(prev_nxt, 0);
       prev_nxt++;
       vec_c = __lsx_vldrepl_b(prev_row, 0);
       prev_row++;
       vec_d = __lsx_vldrepl_b(nxt, 0);

       LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
       LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
       vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
       LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
       LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

       __lsx_vstelm_b(vec_d, nxt, 0, 0);
       nxt++;
    }
 }

 void png_read_filter_row_paeth4_lsx(png_row_infop row_info,
                                     png_bytep row,
                                     png_const_bytep prev_row)
 {
    size_t n = row_info->rowbytes;
    __m128i vec_a, vec_b, vec_c, vec_d;
    __m128i vec_pa, vec_pb, vec_pc;
    __m128i zero = {0};

    vec_a = __lsx_vldrepl_w(row, 0);
    vec_b = __lsx_vldrepl_w(prev_row, 0);
    prev_row += 4;
    vec_d = __lsx_vadd_b(vec_a, vec_b);
    __lsx_vstelm_w(vec_d, row, 0, 0);
    row += 4;
    n -= 4;

    while (n >= 4)
    {
       vec_a = vec_d;
       vec_c = vec_b;
       vec_b = __lsx_vldrepl_w(prev_row, 0);
       prev_row += 4;
       vec_d = __lsx_vldrepl_w(row, 0);

       LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
       LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
       vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
       LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
       LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

       __lsx_vstelm_w(vec_d, row, 0, 0);
       row += 4;
       n -= 4;
    }
 }

 #endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 (intrinsics) */
 #endif /* PNG_READ_SUPPORTED */
	/* filter_lsx_intrinsics.c - LSX optimized filter functions
	*
	* Copyright (c) 2021 Loongson Technology Corporation Limited
	* All rights reserved.
	* Copyright (c) 2018 Cosmin Truta
	* Copyright (c) 2016 Glenn Randers-Pehrson
	* Contributed by Jin Bo (jinbo@loongson.cn)
	*
	* This code is released under the libpng license.
	* For conditions of distribution and use, see the disclaimer
	* and license in png.h
	*/

	#include "../pngpriv.h"

	#ifdef PNG_READ_SUPPORTED

	#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

	#include <lsxintrin.h>

	#define LSX_LD(psrc) __lsx_vld((psrc), 0)

	#define LSX_LD_2(psrc, stride, out0, out1) \
	{ \
	out0 = LSX_LD(psrc); \
	out1 = LSX_LD(psrc + stride); \
	}

	#define LSX_LD_4(psrc, stride, out0, out1, out2, out3) \
	{ \
	LSX_LD_2(psrc, stride, out0, out1); \
	LSX_LD_2(psrc + stride * 2, stride, out2, out3); \
	}

	#define LSX_ST(in, pdst) __lsx_vst(in, (pdst), 0)

	#define LSX_ST_2(in0, in1, pdst, stride) \
	{ \
	LSX_ST(in0, pdst); \
	LSX_ST(in1, pdst + stride); \
	}

	#define LSX_ST_4(in0, in1, in2, in3, pdst, stride) \
	{ \
	LSX_ST_2(in0, in1, pdst, stride); \
	LSX_ST_2(in2, in3, pdst + stride * 2, stride); \
	}

	#define LSX_ADD_B(in0, in1, out0) \
	{ \
	out0 = __lsx_vadd_b(in0, in1); \
	}

	#define LSX_ADD_B_2(in0, in1, in2, in3, out0, out1) \
	{ \
	LSX_ADD_B(in0, in1, out0); \
	LSX_ADD_B(in2, in3, out1); \
	}

	#define LSX_ADD_B_4(in0, in1, in2, in3, in4, in5, \
	in6, in7, out0, out1, out2, out3) \
	{ \
	LSX_ADD_B_2(in0, in1, in2, in3, out0, out1); \
	LSX_ADD_B_2(in4, in5, in6, in7, out2, out3); \
	}

	#define LSX_ABS_B_3(in0, in1, in2, out0, out1, out2) \
	{ \
	out0 = __lsx_vadda_h(in0, zero); \
	out1 = __lsx_vadda_h(in1, zero); \
	out2 = __lsx_vadda_h(in2, zero); \
	}

	#define LSX_ILVL_B(in_h, in_l, out0) \
	{ \
	out0 = __lsx_vilvl_b(in_h, in_l); \
	}

	#define LSX_ILVL_B_2(in0_h, in0_l, in1_h, in1_l, out0, out1) \
	{ \
	LSX_ILVL_B(in0_h, in0_l, out0); \
	LSX_ILVL_B(in1_h, in1_l, out1); \
	}

	#define LSX_HSUB_HU_BU_2(in0, in1, out0, out1) \
	{ \
	out0 = __lsx_vhsubw_hu_bu(in0, in0); \
	out1 = __lsx_vhsubw_hu_bu(in1, in1); \
	}

	#define LSX_CMP_PICK_SMALLER(in0, in1, in2, in3, in4, in5, out0) \
	{ \
	__m128i _cmph, _cmpb, _in0, _in3; \
	_cmph = __lsx_vslt_h(in1, in0); \
	_cmpb = __lsx_vpickev_b(_cmph, _cmph); \
	_in0 = __lsx_vmin_bu(in0,in1); \
	_in3 = __lsx_vbitsel_v(in3, in4, _cmpb); \
	_cmph = __lsx_vslt_h(in2, _in0); \
	_cmpb = __lsx_vpickev_b(_cmph, _cmph); \
	_in3 = __lsx_vbitsel_v(_in3, in5, _cmpb); \
	out0 = __lsx_vadd_b(out0, _in3); \
	}

	void png_read_filter_row_up_lsx(png_row_infop row_info, png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	png_bytep rp = row;
	png_const_bytep pp = prev_row;
	__m128i vec_0, vec_1, vec_2, vec_3;
	__m128i vec_4, vec_5, vec_6, vec_7;

	while (n >= 64)
	{
	LSX_LD_4(rp, 16, vec_0, vec_1, vec_2, vec_3);
	LSX_LD_4(pp, 16, vec_4, vec_5, vec_6, vec_7);
	pp += 64;
	LSX_ADD_B_4(vec_0 ,vec_4, vec_1, vec_5, vec_2, vec_6,
	vec_3, vec_7, vec_0, vec_1, vec_2, vec_3);
	LSX_ST_4(vec_0, vec_1, vec_2, vec_3, rp, 16);
	rp += 64;
	n -= 64;
	}
	if (n & 63)
	{
	if (n >= 32)
	{
	LSX_LD_2(rp, 16, vec_0, vec_1);
	LSX_LD_2(pp, 16, vec_2, vec_3);
	pp += 32;
	LSX_ADD_B_2(vec_0, vec_2, vec_1, vec_3, vec_0, vec_1);
	LSX_ST_2(vec_0, vec_1, rp, 16);
	rp += 32;
	n -= 32;
	}
	if (n & 31)
	{
	if (n >= 16)
	{
	vec_0 = LSX_LD(rp);
	vec_1 = LSX_LD(pp);
	pp += 16;
	LSX_ADD_B(vec_0, vec_1, vec_0);
	LSX_ST(vec_0, rp);
	rp += 16;
	n -= 16;
	}
	if (n >= 8)
	{
	vec_0 = __lsx_vldrepl_d(rp, 0);
	vec_1 = __lsx_vldrepl_d(pp, 0);
	vec_0 = __lsx_vadd_b(vec_0, vec_1);
	__lsx_vstelm_d(vec_0, rp, 0, 0);
	rp += 8;
	pp += 8;
	n -= 8;
	}
	while (n--)
	{
	rp = rp + *pp++;
	rp++;
	}
	}
	}
	}

	void png_read_filter_row_sub3_lsx(png_row_infop row_info, png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	png_uint_32 tmp;
	png_bytep nxt = row;
	__m128i vec_0, vec_1;

	PNG_UNUSED(prev_row);

	vec_0 = __lsx_vldrepl_w(nxt, 0);
	nxt += 3;
	n -= 3;

	while (n >= 3)
	{
	vec_1 = __lsx_vldrepl_w(nxt, 0);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);
	__lsx_vstelm_h(vec_1, nxt, 0, 0);
	vec_0 = vec_1;
	nxt += 2;
	__lsx_vstelm_b(vec_1, nxt, 0, 2);
	nxt += 1;
	n -= 3;
	}

	row = nxt - 3;
	while (n--)
	{
	nxt = nxt + *row++;
	nxt++;
	}
	}

	void png_read_filter_row_sub4_lsx(png_row_infop row_info, png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	__m128i vec_0, vec_1;

	PNG_UNUSED(prev_row);

	vec_0 = __lsx_vldrepl_w(row, 0);
	row += 4;
	n -= 4;

	while (n >= 4)
	{
	vec_1 = __lsx_vldrepl_w(row, 0);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);
	__lsx_vstelm_w(vec_1, row, 0, 0);
	vec_0 = vec_1;
	row += 4;
	n -= 4;
	}
	}

	void png_read_filter_row_avg3_lsx(png_row_infop row_info, png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	png_bytep nxt = row;
	png_const_bytep prev_nxt = prev_row;
	__m128i vec_0, vec_1, vec_2;

	vec_0 = __lsx_vldrepl_w(nxt, 0);
	vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
	prev_nxt += 3;
	vec_1 = __lsx_vsrli_b(vec_1, 1);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);
	__lsx_vstelm_h(vec_1, nxt, 0, 0);
	nxt += 2;
	__lsx_vstelm_b(vec_1, nxt, 0, 2);
	nxt += 1;
	n -= 3;

	while (n >= 3)
	{
	vec_2 = vec_1;
	vec_0 = __lsx_vldrepl_w(nxt, 0);
	vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
	prev_nxt += 3;

	vec_1 = __lsx_vavg_bu(vec_1, vec_2);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);

	__lsx_vstelm_h(vec_1, nxt, 0, 0);
	nxt += 2;
	__lsx_vstelm_b(vec_1, nxt, 0, 2);
	nxt += 1;
	n -= 3;
	}

	row = nxt - 3;
	while (n--)
	{
	vec_2 = __lsx_vldrepl_b(row, 0);
	row++;
	vec_0 = __lsx_vldrepl_b(nxt, 0);
	vec_1 = __lsx_vldrepl_b(prev_nxt, 0);
	prev_nxt++;

	vec_1 = __lsx_vavg_bu(vec_1, vec_2);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);

	__lsx_vstelm_b(vec_1, nxt, 0, 0);
	nxt++;
	}
	}

	void png_read_filter_row_avg4_lsx(png_row_infop row_info, png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	__m128i vec_0, vec_1, vec_2;

	vec_0 = __lsx_vldrepl_w(row, 0);
	vec_1 = __lsx_vldrepl_w(prev_row, 0);
	prev_row += 4;
	vec_1 = __lsx_vsrli_b(vec_1, 1);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);
	__lsx_vstelm_w(vec_1, row, 0, 0);
	row += 4;
	n -= 4;

	while (n >= 4)
	{
	vec_2 = vec_1;
	vec_0 = __lsx_vldrepl_w(row, 0);
	vec_1 = __lsx_vldrepl_w(prev_row, 0);
	prev_row += 4;

	vec_1 = __lsx_vavg_bu(vec_1, vec_2);
	vec_1 = __lsx_vadd_b(vec_1, vec_0);

	__lsx_vstelm_w(vec_1, row, 0, 0);
	row += 4;
	n -= 4;
	}
	}

	void png_read_filter_row_paeth3_lsx(png_row_infop row_info,
	png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	png_bytep nxt = row;
	png_const_bytep prev_nxt = prev_row;
	__m128i vec_a, vec_b, vec_c, vec_d;
	__m128i vec_pa, vec_pb, vec_pc;
	__m128i zero = {0};

	vec_a = __lsx_vldrepl_w(nxt, 0);
	vec_b = __lsx_vldrepl_w(prev_nxt, 0);
	prev_nxt += 3;
	vec_d = __lsx_vadd_b(vec_a, vec_b);
	__lsx_vstelm_h(vec_d, nxt, 0, 0);
	nxt += 2;
	__lsx_vstelm_b(vec_d, nxt, 0, 2);
	nxt += 1;
	n -= 3;

	while (n >= 3)
	{
	vec_a = vec_d;
	vec_c = vec_b;
	vec_b = __lsx_vldrepl_w(prev_nxt, 0);
	prev_nxt += 3;
	vec_d = __lsx_vldrepl_w(nxt, 0);

	LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
	LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
	vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
	LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
	LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

	__lsx_vstelm_h(vec_d, nxt, 0, 0);
	nxt += 2;
	__lsx_vstelm_b(vec_d, nxt, 0, 2);
	nxt += 1;
	n -= 3;
	}

	prev_row = prev_nxt - 3;
	row = nxt - 3;
	while (n--)
	{
	vec_a = __lsx_vldrepl_b(row, 0);
	row++;
	vec_b = __lsx_vldrepl_b(prev_nxt, 0);
	prev_nxt++;
	vec_c = __lsx_vldrepl_b(prev_row, 0);
	prev_row++;
	vec_d = __lsx_vldrepl_b(nxt, 0);

	LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
	LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
	vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
	LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
	LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

	__lsx_vstelm_b(vec_d, nxt, 0, 0);
	nxt++;
	}
	}

	void png_read_filter_row_paeth4_lsx(png_row_infop row_info,
	png_bytep row,
	png_const_bytep prev_row)
	{
	size_t n = row_info->rowbytes;
	__m128i vec_a, vec_b, vec_c, vec_d;
	__m128i vec_pa, vec_pb, vec_pc;
	__m128i zero = {0};

	vec_a = __lsx_vldrepl_w(row, 0);
	vec_b = __lsx_vldrepl_w(prev_row, 0);
	prev_row += 4;
	vec_d = __lsx_vadd_b(vec_a, vec_b);
	__lsx_vstelm_w(vec_d, row, 0, 0);
	row += 4;
	n -= 4;

	while (n >= 4)
	{
	vec_a = vec_d;
	vec_c = vec_b;
	vec_b = __lsx_vldrepl_w(prev_row, 0);
	prev_row += 4;
	vec_d = __lsx_vldrepl_w(row, 0);

	LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
	LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
	vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
	LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
	LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

	__lsx_vstelm_w(vec_d, row, 0, 0);
	row += 4;
	n -= 4;
	}
	}

	#endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 (intrinsics) */
	#endif /* PNG_READ_SUPPORTED */