Optimize png16 with Loongson MMI for 64-bit OS
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c69756..9a57e83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,19 +227,44 @@
       CACHE STRING "Enable MIPS_MSA optimizations: on|off; on is default")
   set_property(CACHE PNG_MIPS_MSA
                PROPERTY STRINGS ${PNG_MIPS_MSA_POSSIBLE_VALUES})
-  list(FIND PNG_MIPS_MSA_POSSIBLE_VALUES ${PNG_MIPS_MSA} index)
-  if(index EQUAL -1)
+  list(FIND PNG_MIPS_MSA_POSSIBLE_VALUES "${PNG_MIPS_MSA}" index_msa)  # quoted: "" must reach the check
+  if(index_msa EQUAL -1)
     message(FATAL_ERROR "PNG_MIPS_MSA must be one of [${PNG_MIPS_MSA_POSSIBLE_VALUES}]")
-  elseif(NOT PNG_MIPS_MSA STREQUAL "off")
+  endif()
+
+  set(PNG_MIPS_MMI_POSSIBLE_VALUES on off)  # the only values accepted below
+  set(PNG_MIPS_MMI "on"
+      CACHE STRING "Enable MIPS_MMI optimizations: on|off; on is default")
+  set_property(CACHE PNG_MIPS_MMI
+               PROPERTY STRINGS ${PNG_MIPS_MMI_POSSIBLE_VALUES})
+  list(FIND PNG_MIPS_MMI_POSSIBLE_VALUES "${PNG_MIPS_MMI}" index_mmi)  # quoted: "" must reach the check
+  if(index_mmi EQUAL -1)
+    message(FATAL_ERROR "PNG_MIPS_MMI must be one of [${PNG_MIPS_MMI_POSSIBLE_VALUES}]")
+  endif()
+
+  if(PNG_MIPS_MSA STREQUAL "on" AND PNG_MIPS_MMI STREQUAL "on")  # both SIMD paths
+    set(libpng_mips_sources
+        mips/mips_init.c
+        mips/filter_msa_intrinsics.c
+        mips/filter_mmi_inline_assembly.c)
+    add_definitions(-DPNG_MIPS_MSA_OPT=2)
+    add_definitions(-DPNG_MIPS_MMI_OPT=1)
+  elseif(PNG_MIPS_MSA STREQUAL "on")  # MSA only
     set(libpng_mips_sources
         mips/mips_init.c
         mips/filter_msa_intrinsics.c)
-    if(PNG_MIPS_MSA STREQUAL "on")
-      add_definitions(-DPNG_MIPS_MSA_OPT=2)
-    endif()
-  else()
+    add_definitions(-DPNG_MIPS_MSA_OPT=2)
+    add_definitions(-DPNG_MIPS_MMI_OPT=0)
+  elseif(PNG_MIPS_MMI STREQUAL "on")  # MMI only
+    set(libpng_mips_sources
+        mips/mips_init.c
+        mips/filter_mmi_inline_assembly.c)
     add_definitions(-DPNG_MIPS_MSA_OPT=0)
-  endif()
+    add_definitions(-DPNG_MIPS_MMI_OPT=1)
+  else()  # neither: no MIPS sources are listed, both options are defined as 0
+    add_definitions(-DPNG_MIPS_MSA_OPT=0)
+    add_definitions(-DPNG_MIPS_MMI_OPT=0)
+  endif()
 endif()
 
 else(PNG_HARDWARE_OPTIMIZATIONS)
diff --git a/Makefile.am b/Makefile.am
index 370bdbf..43ad6e2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -117,6 +117,13 @@
 	mips/filter_msa_intrinsics.c
 endif
 
+if PNG_MIPS_MMI
+if !PNG_MIPS_MSA
+libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += mips/mips_init.c
+endif !PNG_MIPS_MSA
+libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += mips/filter_mmi_inline_assembly.c
+endif PNG_MIPS_MMI
+
 if PNG_INTEL_SSE
 libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += intel/intel_init.c\
 	intel/filter_sse2_intrinsics.c
diff --git a/configure.ac b/configure.ac
index 938c106..c485a6f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -328,6 +328,9 @@
          enable_mips_msa=no
          AC_DEFINE([PNG_MIPS_MSA_OPT], [0],
            [Disable MIPS_MSA optimizations])
+         enable_mips_mmi=no
+         AC_DEFINE([PNG_MIPS_MMI_OPT], [0],
+           [Disable MIPS_MMI optimizations])
          enable_powerpc_vsx=no
          AC_DEFINE([PNG_POWERPC_VSX_OPT], [0],
            [Disable POWERPC VSX optimizations])
@@ -347,7 +350,10 @@
                 [Enable ARM_NEON optimizations])
               ;;
             mipsel*|mips64el*)
+              enable_mips_mmi=yes
               enable_mips_msa=yes
+              AC_DEFINE([PNG_MIPS_MMI_OPT], [1],
+                [Enable MIPS_MMI optimizations])
               AC_DEFINE([PNG_MIPS_MSA_OPT], [2],
                 [Enable MIPS_MSA optimizations])
               ;;
@@ -461,6 +467,51 @@
       mipsel*|mips64el*) : ;;
     esac])
 
+# MIPS MMI
+# ========
+#
+# MIPS MMI (SIMD) support.
+
+AC_ARG_ENABLE([mips-mmi],
+   AS_HELP_STRING([[[--enable-mips-mmi]]],
+      [Enable MIPS MMI optimizations: =no/off, check, api, yes/on:]
+      [no/off: disable the optimizations; check: use internal checking code]
+      [(deprecated and poorly supported); api: disable by default, enable by]
+      [a call to png_set_option; yes/on: turn on unconditionally.]
+      [If not specified: determined by the compiler.]),
+   [case "$enableval" in
+      no|off)
+         # Define PNG_MIPS_MMI_OPT as 0 so the compiler-derived default is overridden:
+         AC_DEFINE([PNG_MIPS_MMI_OPT], [0],
+                   [Disable MIPS MMI optimizations])
+         # Make the PNG_MIPS_MMI conditional below false so no MMI sources are built:
+         enable_mips_mmi=no;;
+      check)
+         AC_DEFINE([PNG_MIPS_MMI_CHECK_SUPPORTED], [],
+                   [Check for MIPS MMI support at run-time]);;
+      api)
+         AC_DEFINE([PNG_MIPS_MMI_API_SUPPORTED], [],
+                   [Turn on MIPS MMI optimizations at run-time]);;
+      yes|on)
+         AC_DEFINE([PNG_MIPS_MMI_OPT], [1],
+                   [Enable MIPS MMI optimizations])
+         AC_MSG_WARN([--enable-mips-mmi: please specify 'check' or 'api', if]
+            [you want the optimizations unconditionally pass '-mloongson-mmi -march=loongson3a']
+            [to the compiler.]);;
+      *)
+         AC_MSG_ERROR([--enable-mips-mmi=${enable_mips_mmi}: invalid value])
+   esac])
+
+# Build the MIPS MMI sources on little-endian MIPS hosts, or on any other host
+# when --enable-mips-mmi was given explicitly (fallback for unrecognised CPUs).
+
+AM_CONDITIONAL([PNG_MIPS_MMI],
+   [test "$enable_mips_mmi" != 'no' &&
+    case "$host_cpu" in
+      mipsel*|mips64el*) :;;
+      *)    test "$enable_mips_mmi" != '';;
+    esac])
+
 # INTEL
 # =====
 #
diff --git a/contrib/mips-mmi/linux.c b/contrib/mips-mmi/linux.c
new file mode 100644
index 0000000..5bb79a6
--- /dev/null
+++ b/contrib/mips-mmi/linux.c
@@ -0,0 +1,140 @@
+/* contrib/mips-mmi/linux.c
+ *
+ * Written by guxiwei 2023
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ *
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/auxv.h>
+
+/*
+ * parse_r var, r - Helper assembler macro for parsing register names.
+ *
+ * This converts the register name in $n form provided in \r to the
+ * corresponding register number, which is assigned to the variable \var. It is
+ * needed to allow explicit encoding of instructions in inline assembly where
+ * registers are chosen by the compiler in $n form, allowing us to avoid using
+ * fixed register numbers.
+ *
+ * It also allows newer instructions (not implemented by the assembler) to be
+ * transparently implemented using assembler macros, instead of needing separate
+ * cases depending on toolchain support.
+ *
+ * Simple usage example:
+ * __asm__ __volatile__("parse_r __rt, %0\n\t"
+ *                      ".insn\n\t"
+ *                      "# di    %0\n\t"
+ *                      ".word   (0x41606000 | (__rt << 16))"
+ *                      : "=r" (status));
+ */
+
+/* Emit one assembler .ifc test: if \r is literally "$n", set \var to n */
+#define _IFC_REG(n)                                \
+        ".ifc        \\r, $" #n "\n\t"             \
+        "\\var        = " #n "\n\t"                \
+        ".endif\n\t"
+
+__asm__(".macro        parse_r var r\n\t"          /* file-scope assembler macro */
+        "\\var        = -1\n\t"                    /* -1 means no register matched */
+        _IFC_REG(0)  _IFC_REG(1)  _IFC_REG(2)  _IFC_REG(3)
+        _IFC_REG(4)  _IFC_REG(5)  _IFC_REG(6)  _IFC_REG(7)
+        _IFC_REG(8)  _IFC_REG(9)  _IFC_REG(10) _IFC_REG(11)
+        _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15)
+        _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19)
+        _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23)
+        _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27)
+        _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)
+        ".iflt        \\var\n\t"                   /* still negative: none of $0..$31 */
+        ".error        \"Unable to parse register name \\r\"\n\t"
+        ".endif\n\t"
+        ".endm");
+
+#define HWCAP_LOONGSON_CPUCFG (1 << 14)   /* AT_HWCAP bit tested below */
+
+static int cpucfg_available(void)
+{
+    return (int)(getauxval(AT_HWCAP) & HWCAP_LOONGSON_CPUCFG); /* non-zero: bit set */
+}
+
+/* Return 1 if str starts with pfx (tail stored via ptr when non-NULL), else 0. */
+static int strstart(const char *str, const char *pfx, const char **ptr)
+{
+    for (; *pfx != '\0'; pfx++, str++)
+        if (*pfx != *str)
+            return 0;
+    if (ptr != NULL)
+        *ptr = str;
+    return 1;
+}
+
+/* Read CPUCFG register reg. Most toolchains have no CPUCFG support yet */
+static uint32_t read_cpucfg(uint32_t reg)
+{
+        uint32_t __res;
+
+        __asm__ __volatile__(
+                "parse_r __res,%0\n\t"   /* asm symbol __res := number of the output register */
+                "parse_r reg,%1\n\t"     /* asm symbol reg := number of the input register */
+                ".insn \n\t"
+                ".word (0xc8080118 | (reg << 21) | (__res << 11))\n\t" /* hand-encoded word */
+                :"=r"(__res)
+                :"r"(reg)
+                :
+                );
+        return __res;
+}
+
+#define LOONGSON_CFG1 0x1
+
+#define LOONGSON_CFG1_MMI    (1 << 4)
+
+/* Return 1 when the LOONGSON_CFG1_MMI bit is set in the value read_cpucfg()
+ * yields for LOONGSON_CFG1, and 0 when it is clear. */
+static int cpu_flags_cpucfg(void)
+{
+    const uint32_t word = read_cpucfg(LOONGSON_CFG1);
+
+    if ((word & LOONGSON_CFG1_MMI) == 0)
+        return 0;
+    return 1;
+}
+
+/* Look for evidence of MMI support in /proc/cpuinfo; 1 if found, else 0. */
+static int cpu_flags_cpuinfo(void)
+{
+    FILE *f = fopen("/proc/cpuinfo", "r");
+    char buf[200];
+    int flags = 0;
+
+    if (!f)
+        return flags;
+
+    /* Either line can prove support, so a miss on one of them must not end
+     * the scan: stop only on a hit or when the file is exhausted. */
+    while (!flags && fgets(buf, sizeof(buf), f)) {
+        /* Legacy kernel may not export MMI in ASEs implemented */
+        if (strstart(buf, "cpu model", NULL)) {
+            if (strstr(buf, "Loongson-3 "))
+                flags = 1;
+        } else if (strstart(buf, "ASEs implemented", NULL)) {
+            if (strstr(buf, " loongson-mmi"))
+                flags = 1;
+        }
+    }
+    fclose(f);
+    return flags;
+}
+
+/* Run-time probe for MMI support; the result is non-zero when it is present.
+ * cpu_flags_cpucfg() is used when cpucfg_available() reports success, and
+ * cpu_flags_cpuinfo() otherwise. */
+static int png_have_mmi()
+{
+    return cpucfg_available() ? cpu_flags_cpucfg()
+                              : cpu_flags_cpuinfo();
+}
diff --git a/mips/filter_mmi_inline_assembly.c b/mips/filter_mmi_inline_assembly.c
new file mode 100644
index 0000000..06cb1cc
--- /dev/null
+++ b/mips/filter_mmi_inline_assembly.c
@@ -0,0 +1,524 @@
+/* filter_mmi_inline_assembly.c - MMI optimized filter functions
+ *
+ * Written by zhanglixia and guxiwei
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+#if PNG_MIPS_MMI_IMPLEMENTATION == 2 /* Inline Assembly */
+
+/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
+ * They're positioned like this:
+ *    prev:  c b
+ *    row:   a d
+ * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
+ * whichever of a, b, or c is closest to p=a+b-c.
+ */
+
+void png_read_filter_row_up_mmi(png_row_infop row_info, png_bytep row,
+                                png_const_bytep prev_row)
+{
+   int istop = row_info->rowbytes;   /* bytes still to do; loop ends at <= 0 */
+   double rp,pp;                     /* FP scratch for row and prev_row data */
+   __asm__ volatile (                /* Up: row[i] += prev_row[i], 8 bytes/pass */
+       "1:                                          \n\t"
+       "ldc1   %[rp],       0x00(%[row])            \n\t"
+       "ldc1   %[pp],       0x00(%[prev_row])       \n\t"
+       "paddb  %[rp],       %[rp],            %[pp] \n\t"
+       "sdc1   %[rp],       0x00(%[row])            \n\t"
+       /* NOTE(review): whole 8-byte groups only; buffer padding assumed - confirm */
+       "daddiu %[row],      %[row],           0x08  \n\t"
+       "daddiu %[prev_row], %[prev_row],      0x08  \n\t"
+       "daddiu %[istop],    %[istop],        -0x08  \n\t"
+       "bgtz   %[istop],    1b                      \n\t"
+       : [rp]"=&f"(rp), [pp]"=&f"(pp), [row]"+r"(row),
+         [prev_row]"+r"(prev_row), [istop]"+r"(istop)
+       : /* no input-only operands: the asm rewrites row, prev_row and istop */
+       : "memory"
+   );
+}
+
+void png_read_filter_row_sub3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;   /* bytes still to do; loop ends at <= 0 */
+   double rp, pp, dest;              /* rp/pp: row+0 and row+8 data; dest: carry */
+   double eight, sixteen, twenty_four, forty_eight;   /* shift counts in bits */
+   double tmp0;                      /* bound to a GPR ("=&r") in the asm below */
+   double ftmp[2];
+   /* Sub filter, 3 bytes per pixel: each pass rebuilds 4 pixels (12 bytes). */
+   __asm__ volatile (
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+        "xor        %[dest],    %[dest],       %[dest]        \n\t"
+        /* dest carries the pixel rebuilt last; it is zero before the first */
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x08(%[row])                  \n\t"
+        "gsldlc1    %[pp],      0x0f(%[row])                  \n\t"
+        /* pixel 0: raw bytes at row+0 plus the carried pixel */
+        "paddb      %[ftmp0],   %[dest],      %[rp]           \n\t"
+        "swc1       %[ftmp0],   0x00(%[row])                  \n\t"
+        /* pixel 1: rp shifted down 24 bits, plus pixel 0 */
+        "dsrl       %[ftmp1],   %[rp],        %[twenty_four]  \n\t"
+        "paddb      %[dest],    %[ftmp1],     %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x03(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x06(%[row])                  \n\t"
+        /* pixel 2 straddles the two loads: top of rp joined with bottom of pp */
+        "dsrl       %[ftmp0],   %[rp],        %[forty_eight]  \n\t"
+        "dsll       %[ftmp1],   %[pp],        %[sixteen]      \n\t"
+        "or         %[ftmp0],   %[ftmp0],     %[ftmp1]        \n\t"
+        "paddb      %[dest],    %[dest],      %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x06(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x09(%[row])                  \n\t"
+        /* pixel 3: pp shifted down 8 bits, plus pixel 2 */
+        "dsrl       %[ftmp0],   %[pp],        %[eight]        \n\t"
+        "paddb      %[dest],    %[dest],      %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x09(%[row])                  \n\t"
+        "daddiu     %[row],     %[row],       0x0c            \n\t"
+        "daddiu     %[istop],   %[istop],    -0x0c            \n\t"
+        "bgtz       %[istop],   1b                            \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [dest]"=&f"(dest),
+          [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [eight]"=&f"(eight),
+          [sixteen]"=&f"(sixteen), [twenty_four]"=&f"(twenty_four),
+          [forty_eight]"=&f"(forty_eight), [row]"+r"(row), [istop]"+r"(istop)
+        : /* no input-only operands: row and istop are rewritten above */
+        : "memory"
+   );
+
+   PNG_UNUSED(prev)
+}
+
+void png_read_filter_row_sub4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* Sub filter, 4 bytes per pixel: every pixel is its stored value plus the
+    * pixel to its left; the first pixel has no left neighbour and is left as
+    * it is.  Each pass adds the pixel at row+0 into the pixel at row+4, so n
+    * pixels need n-1 passes: the byte budget is rowbytes-4, not rowbytes. */
+   int istop = (int)row_info->rowbytes - 4;
+   double rp,pp;
+   if (istop > 0) {   /* a single-pixel row has nothing to add */
+      __asm__ volatile (
+           "1:                                          \n\t"
+           "lwc1   %[pp],       0x00(%[row])            \n\t"
+           "lwc1   %[rp],       0x04(%[row])            \n\t"
+           "paddb  %[rp],       %[rp],       %[pp]      \n\t"
+           "swc1   %[rp],       0x04(%[row])            \n\t"
+
+           "daddiu %[row],      %[row],      0x04       \n\t"
+           "daddiu %[istop],    %[istop],   -0x04       \n\t"
+           "bgtz   %[istop],    1b                      \n\t"
+           : [rp]"=&f"(rp), [pp]"=&f"(pp), [row]"+r"(row), [istop]"+r"(istop)
+           : /* no input-only operands: row and istop are rewritten above */
+           : "memory"
+      );
+   }
+   PNG_UNUSED(prev)
+}
+
+void png_read_filter_row_avg3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;   /* bytes still to do; loop ends at <= 0 */
+   double rp, pp, rp1, pp1;          /* 8 bytes each from row/prev at +0 and +8 */
+   double tmp0, ftmp[3];             /* tmp0 is bound to a GPR ("=&r") below */
+   double one, dest;                 /* dest carries the pixel rebuilt last */
+   double eight, sixteen, twenty_four, forty_eight;   /* shift counts in bits */
+   /* Avg filter, 3 bytes per pixel: each pass rebuilds 4 pixels (12 bytes). */
+   __asm__ volatile (
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+        "xor        %[dest],    %[dest],       %[dest]        \n\t"
+        /* NOTE(review): one presumably ends up as 0x01 in every byte - confirm */
+        "li         %[tmp0],   0x01                           \n\t"
+        "ins        %[tmp0],   %[tmp0],        8,   8         \n\t"
+        "dmtc1      %[tmp0],   %[one]                         \n\t"
+        "pshufh     %[one],    %[one],         %[dest]        \n\t"
+        /* dest is zero here, so the first pixel averages prev with zero */
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x00(%[prev])                 \n\t"
+        "gsldlc1    %[pp],      0x07(%[prev])                 \n\t"
+        "gsldrc1    %[rp1],     0x08(%[row])                  \n\t"
+        "gsldlc1    %[rp1],     0x0f(%[row])                  \n\t"
+        "gsldrc1    %[pp1],     0x08(%[prev])                 \n\t"
+        "gsldlc1    %[pp1],     0x0f(%[prev])                 \n\t"
+        /* pixel 0: pavgb minus ((x ^ y) & one), assumed to truncate (a+b)/2 */
+        "xor        %[ftmp0],   %[pp],         %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[pp],         %[dest]        \n\t"
+        "and        %[ftmp0],   %[ftmp0],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp0]       \n\t"
+        "paddb      %[dest],    %[rp],         %[ftmp1]       \n\t"
+        "swc1       %[dest],    0x00(%[row])                  \n\t"
+        /* pixel 1: rp and pp shifted down 24 bits */
+        "dsrl       %[ftmp0],   %[rp],         %[twenty_four] \n\t"
+        "dsrl       %[ftmp1],   %[pp],         %[twenty_four] \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x03(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x06(%[row])                  \n\t"
+        /* pixel 2 straddles the loads: top of rp/pp joined with rp1/pp1 */
+        "dsrl       %[ftmp0],   %[rp],         %[forty_eight] \n\t"
+        "dsll       %[ftmp1],   %[rp1],        %[sixteen]     \n\t"
+        "or         %[ftmp0],   %[ftmp0],      %[ftmp1]       \n\t"
+        "dsrl       %[ftmp2],   %[pp],         %[forty_eight] \n\t"
+        "dsll       %[ftmp1],   %[pp1],        %[sixteen]     \n\t"
+        "or         %[ftmp1],   %[ftmp2],      %[ftmp1]       \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x06(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x09(%[row])                  \n\t"
+        /* pixel 3: rp1 and pp1 shifted down 8 bits */
+        "dsrl       %[ftmp0],   %[rp1],        %[eight]       \n\t"
+        "dsrl       %[ftmp1],   %[pp1],        %[eight]       \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x09(%[row])                  \n\t"
+        "daddiu     %[row],     %[row],        0x0c           \n\t"
+        "daddiu     %[prev],    %[prev],       0x0c           \n\t"
+        "daddiu     %[istop],   %[istop],     -0x0c           \n\t"
+        "bgtz       %[istop],   1b                            \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [rp1]"=&f"(rp1),
+          [pp1]"=&f"(pp1), [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [one]"=&f"(one),
+          [dest]"=&f"(dest), [eight]"=&f"(eight), [sixteen]"=&f"(sixteen),
+          [twenty_four]"=&f"(twenty_four), [forty_eight]"=&f"(forty_eight),
+          [row]"+r"(row), [prev]"+r"(prev), [istop]"+r"(istop)
+        : /* no input-only operands: row, prev and istop are rewritten above */
+        : "memory"
+   );
+}
+
+void png_read_filter_row_avg4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;   /* bytes still to do; loop ends at <= 0 */
+   double rp,pp;                     /* 4 bytes each from row and prev */
+   double dest;                      /* the pixel rebuilt last (left neighbour) */
+   double ftmp[2], tmp;              /* tmp is bound to a GPR ("=&r") below */
+   /* Avg filter, 4 bytes per pixel: one pixel is rebuilt per pass. */
+   __asm__ volatile (
+        "xor        %[dest],   %[dest],       %[dest]  \n\t"
+        "li         %[tmp],    0x01                    \n\t"
+        "ins        %[tmp],    %[tmp],        8,  8    \n\t"
+        "dmtc1      %[tmp],    %[ftmp1]                \n\t"
+        "pshufh     %[ftmp1],  %[ftmp1],      %[dest]  \n\t"
+        /* NOTE(review): ftmp1 presumably holds 0x01 in every byte - confirm */
+        "1:                                            \n\t"
+        "lwc1       %[rp],     0x00(%[row])            \n\t"
+        "lwc1       %[pp],     0x00(%[prev])           \n\t"
+        "xor        %[ftmp0],  %[pp],         %[dest]  \n\t"
+        "pavgb      %[pp],     %[pp],         %[dest]  \n\t"
+        "and        %[ftmp0],  %[ftmp0],      %[ftmp1] \n\t"
+        "psubb      %[pp],     %[pp],         %[ftmp0] \n\t"
+        "paddb      %[dest],   %[rp],         %[pp]    \n\t"
+        "swc1       %[dest],   0x00(%[row])            \n\t"
+        "daddiu     %[row],    %[row],        0x04     \n\t"
+        "daddiu     %[prev],   %[prev],       0x04     \n\t"
+        "daddiu     %[istop],  %[istop],     -0x04     \n\t"
+        "bgtz       %[istop],  1b                      \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [dest]"=&f"(dest), [tmp]"=&r"(tmp),
+          [row]"+r"(row), [prev]"+r"(prev), [istop]"+r"(istop)
+        : /* no input-only operands: row, prev and istop are rewritten above */
+        : "memory"
+   );
+}
+
+void png_read_filter_row_paeth3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
+    * and two pixels from the previous row, b and c:
+    *   prev: c b
+    *   row:  a d
+    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
+    * p=a+b-c.
+    *
+    * The first pixel has no left context, and so uses an Up filter, p = b.
+    * This works naturally with our main loop's p = a+b-c if we force a and c
+    * to zero, which the two leading xor instructions below do directly.
+    * Pixels are 3 bytes wide; every pass of the loop rebuilds four of them
+    * (12 bytes), carrying a and c from one pixel to the next.
+    */
+   int istop = row_info->rowbytes;
+   double rp, pp, rp1, pp1, zero;
+   double a, b, c, d, pa, pb, pc;
+   double tmp0, ftmp[3];             /* tmp0 is bound to a GPR ("=&r") below */
+   double eight, sixteen, twenty_four, forty_eight;
+
+   __asm__ volatile (
+        "xor        %[a],      %[a],           %[a]           \n\t"
+        "xor        %[c],      %[c],           %[c]           \n\t"
+        "xor        %[zero],   %[zero],        %[zero]        \n\t"
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+        /* a and c are zero on the first pass; later passes inherit them */
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x00(%[prev])                 \n\t"
+        "gsldlc1    %[pp],      0x07(%[prev])                 \n\t"
+        "gsldrc1    %[rp1],     0x08(%[row])                  \n\t"
+        "gsldlc1    %[rp1],     0x0f(%[row])                  \n\t"
+        "gsldrc1    %[pp1],     0x08(%[prev])                 \n\t"
+        "gsldlc1    %[pp1],     0x0f(%[prev])                 \n\t"
+        /* pixel 0 of the group: taken straight from rp and pp */
+        "punpcklbh  %[b],      %[pp],          %[zero]        \n\t"
+        "punpcklbh  %[d],      %[rp],          %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[pp],          %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[pp],          %[zero]        \n\t"
+        "swc1       %[d],      0x00(%[row])                   \n\t"
+        /* pixel 1: rp and pp shifted down 24 bits */
+        "dsrl       %[ftmp0],  %[rp],          %[twenty_four] \n\t"
+        "dsrl       %[ftmp2],  %[pp],          %[twenty_four] \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x03(%[row])                   \n\t"
+        "gsswlc1    %[d],      0x06(%[row])                   \n\t"
+        /* pixel 2 straddles the loads: top of rp/pp joined with rp1/pp1 */
+        "dsrl       %[ftmp0],  %[rp],          %[forty_eight] \n\t"
+        "dsll       %[ftmp1],  %[rp1],         %[sixteen]     \n\t"
+        "or         %[ftmp0],  %[ftmp0],       %[ftmp1]       \n\t"
+        "dsrl       %[ftmp2],  %[pp],          %[forty_eight] \n\t"
+        "dsll       %[ftmp1],  %[pp1],         %[sixteen]     \n\t"
+        "or         %[ftmp2],  %[ftmp2],       %[ftmp1]       \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x06(%[row])                   \n\t"
+        "gsswlc1    %[d],      0x09(%[row])                   \n\t"
+        /* pixel 3: rp1 and pp1 shifted down 8 bits */
+        "dsrl       %[ftmp0],   %[rp1],        %[eight]       \n\t"
+        "dsrl       %[ftmp2],   %[pp1],        %[eight]       \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x09(%[row])                   \n\t"
+
+        "daddiu     %[row],    %[row],         0x0c           \n\t"
+        "daddiu     %[prev],   %[prev],        0x0c           \n\t"
+        "daddiu     %[istop],  %[istop],      -0x0c           \n\t"
+        "bgtz       %[istop],  1b                             \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [rp1]"=&f"(rp1), [pp1]"=&f"(pp1),
+          [zero]"=&f"(zero), [a]"=&f"(a),[b]"=&f"(b), [c]"=&f"(c),
+          [d]"=&f"(d), [pa]"=&f"(pa), [pb]"=&f"(pb), [pc]"=&f"(pc),
+          [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]), [eight]"=&f"(eight), [sixteen]"=&f"(sixteen),
+          [twenty_four]"=&f"(twenty_four), [forty_eight]"=&f"(forty_eight),
+          [row]"+r"(row), [prev]"+r"(prev), [istop]"+r"(istop)
+        : /* no input-only operands: row, prev and istop are rewritten above */
+        : "memory"
+   );
+}
+
+void png_read_filter_row_paeth4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
+    * and two pixels from the previous row, b and c:
+    *   prev: c b
+    *   row:  a d
+    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
+    * p=a+b-c.
+    *
+    * The first pixel has no left context, and so uses an Up filter, p = b.
+    * This works naturally with our main loop's p = a+b-c if we force a and c
+    * to zero, so a and c are cleared before the loop.  Each pass handles one
+    * 4-byte pixel; the loop body always runs at least once and steps istop
+    * down by 4 until it is no longer positive.
+    */
+   int istop = row_info->rowbytes;
+   double rp, pp, zero;
+   double a, b, c, d, pa, pb, pc;
+   double ftmp[2];
+
+   __asm__ volatile (
+        "xor        %[a],      %[a],           %[a]     \n\t"
+        "xor        %[c],      %[c],           %[c]     \n\t"
+        "xor        %[zero],   %[zero],        %[zero]  \n\t"
+        /* Load one pixel from each row and widen b and d to halfwords. */
+        "1:                                             \n\t"
+        "lwc1       %[rp],     0x00(%[row])             \n\t"
+        "lwc1       %[pp],     0x00(%[prev])            \n\t"
+        "punpcklbh  %[b],      %[pp],          %[zero]  \n\t"
+        "punpcklbh  %[d],      %[rp],          %[zero]  \n\t"
+        /* pa = |b - c|, pb = |a - c|, pc = |(b - c) + (a - c)| */
+        "packushb   %[ftmp0],  %[c],           %[c]     \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]     \n\t"
+        "pasubub    %[pa],     %[pp],          %[ftmp0] \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0] \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]     \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]     \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1] \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]    \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0] \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0] \n\t"
+        /* Widen pa and pb so they can be compared as halfwords. */
+        "punpcklbh  %[pa],     %[pa],           %[zero] \n\t"
+        "punpcklbh  %[pb],     %[pb],           %[zero] \n\t"
+        /* a = (pa > pb) ? b : a;  pa = min(pa, pb) */
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]    \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0] \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]     \n\t"
+        "or         %[a],      %[a],           %[ftmp1] \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]    \n\t"
+        /* a = (pa > pc) ? c : a;  then add d, store, and keep b as next c */
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]    \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0] \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]     \n\t"
+        "or         %[a],      %[a],           %[ftmp1] \n\t"
+        "paddb      %[a],      %[a],           %[d]     \n\t"
+        "packushb   %[d],      %[a],           %[a]     \n\t"
+        "swc1       %[d],      0x00(%[row])             \n\t"
+        "punpcklbh  %[c],      %[pp],          %[zero]  \n\t"
+        "daddiu     %[row],    %[row],         0x04     \n\t"
+        "daddiu     %[prev],   %[prev],        0x04     \n\t"
+        "daddiu     %[istop],  %[istop],      -0x04     \n\t"
+        "bgtz       %[istop],  1b                       \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [zero]"=&f"(zero),
+          [a]"=&f"(a), [b]"=&f"(b), [c]"=&f"(c), [d]"=&f"(d),
+          [pa]"=&f"(pa), [pb]"=&f"(pb), [pc]"=&f"(pc),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+          [row]"+r"(row), [prev]"+r"(prev), [istop]"+r"(istop)
+        : /* none: row, prev and istop are updated, so they are outputs */
+        : "memory"
+   );
+}
+
+#endif /* PNG_MIPS_MMI_IMPLEMENTATION > 0 */
+#endif /* READ */
diff --git a/mips/mips_init.c b/mips/mips_init.c
index 8dd283d..20a9fa8 100644
--- a/mips/mips_init.c
+++ b/mips/mips_init.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2018 Cosmin Truta
  * Copyright (c) 2016 Glenn Randers-Pehrson
  * Written by Mandar Sahastrabuddhe, 2016.
+ * Update by guxiwei, 2023.
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -20,8 +21,9 @@
 
 #ifdef PNG_READ_SUPPORTED
 
-#if PNG_MIPS_MSA_OPT > 0
-#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */
+#if PNG_MIPS_MSA_OPT > 0 || PNG_MIPS_MMI_IMPLEMENTATION > 0
+
+#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do MIPS MSA run-time checks */
 /* WARNING: it is strongly recommended that you do not build libpng with
  * run-time checks for CPU features if at all possible.  In the case of the MIPS
  * MSA instructions there is no processor-specific way of detecting the
@@ -51,13 +53,83 @@
 #endif /* PNG_MIPS_MSA_FILE */
 #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
 
+#ifdef PNG_MIPS_MMI_CHECK_SUPPORTED /* Do MIPS MMI run-time checks */
+#ifndef PNG_MIPS_MMI_FILE
+#  ifdef __linux__
+#     define PNG_MIPS_MMI_FILE "contrib/mips-mmi/linux.c"
+#  endif
+#endif
+/* Without a probe file (default or user-supplied) the build is rejected. */
+#ifdef PNG_MIPS_MMI_FILE
+/* Declare the probe first; the included file is expected to define it. */
+#include <signal.h> /* for sig_atomic_t */
+static int png_have_mmi(void);
+#include PNG_MIPS_MMI_FILE
+
+#else  /* PNG_MIPS_MMI_FILE */
+#  error "PNG_MIPS_MMI_FILE undefined: no support for run-time MIPS MMI checks"
+#endif /* PNG_MIPS_MMI_FILE */
+#endif /* PNG_MIPS_MMI_CHECK_SUPPORTED */
+
 #ifndef PNG_ALIGNED_MEMORY_SUPPORTED
 #  error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED"
 #endif
 
+/* MIPS supports two optimizations: MMI and MSA. MMI is handled first (when
+ * compiled in); control then always reaches MIPS_MSA_INIT for the MSA setup.
+ */
 void
-png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
+png_init_filter_functions_mips(png_structp pp, unsigned int bpp)
 {
+#if PNG_MIPS_MMI_IMPLEMENTATION  > 0
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+   switch ((pp->options >> PNG_MIPS_MMI) & 3)
+   {
+      case PNG_OPTION_UNSET:
+#endif /* PNG_MIPS_MMI_API_SUPPORTED */
+#ifdef PNG_MIPS_MMI_CHECK_SUPPORTED
+         {
+            static volatile sig_atomic_t no_mmi = -1; /* not checked */
+
+            if (no_mmi < 0)
+               no_mmi = !png_have_mmi();
+
+            if (no_mmi)
+              goto MIPS_MSA_INIT;
+         }
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+         break;
+#endif
+#endif /* PNG_MIPS_MMI_CHECK_SUPPORTED */
+
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+      default: /* OFF or INVALID */
+         goto MIPS_MSA_INIT;
+
+      case PNG_OPTION_ON:
+         /* Option turned on */
+         break;
+   }
+#endif
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_mmi;
+   if (bpp == 3)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+         png_read_filter_row_paeth3_mmi;
+   }
+   else if (bpp == 4)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+          png_read_filter_row_paeth4_mmi;
+   }
+#endif /* PNG_MIPS_MMI_IMPLEMENTATION > 0 */
+
+MIPS_MSA_INIT:
+#if PNG_MIPS_MSA_OPT > 0
    /* The switch statement is compiled in for MIPS_MSA_API, the call to
     * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined
     * the check is only performed if the API has not set the MSA option on
@@ -125,6 +197,8 @@
       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
    }
-}
 #endif /* PNG_MIPS_MSA_OPT > 0 */
+   return;
+}
+#endif /* PNG_MIPS_MSA_OPT > 0 || PNG_MIPS_MMI_IMPLEMENTATION > 0 */
 #endif /* READ */
diff --git a/png.h b/png.h
index eaee5a3..457c932 100644
--- a/png.h
+++ b/png.h
@@ -3208,7 +3208,11 @@
 #ifdef PNG_POWERPC_VSX_API_SUPPORTED
 #  define PNG_POWERPC_VSX   10 /* HARDWARE: PowerPC VSX SIMD instructions supported */
 #endif
-#define PNG_OPTION_NEXT  12 /* Next option - numbers must be even */
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+#  define PNG_MIPS_MMI   12 /* HARDWARE: MIPS MMI SIMD instructions supported */
+#endif
+
+#define PNG_OPTION_NEXT  14 /* Next option - numbers must be even */
 
 /* Return values: NOTE: there are four values and 'off' is *not* zero */
 #define PNG_OPTION_UNSET   0 /* Unset - defaults to off */
diff --git a/pngpriv.h b/pngpriv.h
index cdbc6c3..fdf0f35 100644
--- a/pngpriv.h
+++ b/pngpriv.h
@@ -197,6 +197,16 @@
 #  endif
 #endif
 
+#ifndef PNG_MIPS_MMI_OPT
+#  ifdef PNG_MIPS_MMI
+#    if defined(__mips_loongson_mmi) && (_MIPS_SIM == _ABI64) && defined(PNG_ALIGNED_MEMORY_SUPPORTED)
+#       define PNG_MIPS_MMI_OPT 1
+#    else
+#       define PNG_MIPS_MMI_OPT 0
+#    endif
+#  endif
+#endif
+
 #ifndef PNG_POWERPC_VSX_OPT
 #  if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
 #     define PNG_POWERPC_VSX_OPT 2
@@ -248,7 +258,7 @@
 #endif
 
 #if PNG_MIPS_MSA_OPT > 0
-#  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_msa
+#  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_mips
 #  ifndef PNG_MIPS_MSA_IMPLEMENTATION
 #     if defined(__mips_msa)
 #        if defined(__clang__)
@@ -269,6 +279,22 @@
 #  define PNG_MIPS_MSA_IMPLEMENTATION 0
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 
+#if PNG_MIPS_MMI_OPT > 0
+#  ifndef PNG_MIPS_MMI_IMPLEMENTATION
+#     if defined(__mips_loongson_mmi) && (_MIPS_SIM == _ABI64)
+#        define PNG_MIPS_MMI_IMPLEMENTATION 2
+#     else /* !defined __mips_loongson_mmi  || _MIPS_SIM != _ABI64 */
+#        define PNG_MIPS_MMI_IMPLEMENTATION 0
+#     endif /* __mips_loongson_mmi  && _MIPS_SIM == _ABI64 */
+#  endif /* !PNG_MIPS_MMI_IMPLEMENTATION */
+/* Route filter setup through the shared MIPS initializer when MMI is usable. */
+#   if PNG_MIPS_MMI_IMPLEMENTATION > 0
+#      define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_mips
+#   endif
+#else /* MMI not requested: mark the MMI (not the SSE) implementation absent */
+#   define PNG_MIPS_MMI_IMPLEMENTATION 0
+#endif /* PNG_MIPS_MMI_OPT > 0 */
+
 #if PNG_POWERPC_VSX_OPT > 0
 #  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vsx
 #  define PNG_POWERPC_VSX_IMPLEMENTATION 1
@@ -1329,6 +1355,23 @@
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
+#if PNG_MIPS_MMI_IMPLEMENTATION > 0
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_mmi,(png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#endif
+
 #if PNG_POWERPC_VSX_OPT > 0
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vsx,(png_row_infop row_info,
     png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
@@ -2118,10 +2161,15 @@
 #endif
 
 #if PNG_MIPS_MSA_OPT > 0
-PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_msa,
+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_mips,
    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
 #endif
 
+#  if PNG_MIPS_MMI_IMPLEMENTATION > 0
+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_mips,
+   (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
+#  endif
+
 #  if PNG_INTEL_SSE_IMPLEMENTATION > 0
 PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_sse2,
    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
diff --git a/scripts/pnglibconf.dfa b/scripts/pnglibconf.dfa
index e15a23c..f4d14a2 100644
--- a/scripts/pnglibconf.dfa
+++ b/scripts/pnglibconf.dfa
@@ -256,6 +256,55 @@
 option POWERPC_VSX_CHECK disabled,
   sets POWERPC_VSX_OPT 1
 
+# These options are specific to the MIPS MSA hardware optimizations.
+#
+# MIPS_MSA_OPT: unset: check at compile time (__mips_msa must be defined by
+#                      the compiler, typically as a result of specifying
+#                      "-mmsa -mfp64" compiler flags)
+#                   0: disable (even if the CPU supports MSA.)
+#                   1: check at run time (via MIPS_MSA_{API,CHECK})
+#                   2: switch on unconditionally (inadvisable - instead pass
+#                      -mmsa -mfp64 to compiler options)
+#           When building libpng avoid using any setting other than '0'; '1' is
+#           set automatically when either 'API' or 'CHECK' are configured in,
+#           '2' should not be necessary as "-mmsa -mfp64" will achieve the same
+#           effect as well as applying MSA optimizations to the rest of the
+#           libpng code.
+#           NOTE: any setting other than '0' requires ALIGNED_MEMORY
+# MIPS_MSA_API:   (PNG_MIPS_MSA_OPT == 1) allow the optimization to be
+#                 switched on with png_set_option.
+# MIPS_MSA_CHECK: (PNG_MIPS_MSA_OPT == 1) compile a run-time check to see if
+#                 MSA extensions are supported.
+setting MIPS_MSA_OPT
+option MIPS_MSA_API disabled requires ALIGNED_MEMORY enables SET_OPTION,
+  sets MIPS_MSA_OPT 1
+option MIPS_MSA_CHECK disabled requires ALIGNED_MEMORY,
+  sets MIPS_MSA_OPT 1
+
+# These options are specific to the MIPS MMI hardware optimizations.
+#
+# MIPS_MMI_OPT: unset: check at compile time (__mips_loongson_mmi must be defined by
+#                      the compiler, typically as a result of specifying
+#                      "-mloongson-mmi -march=loongson3a" compiler flags)
+#                   0: disable (even if the CPU supports MMI.)
+#                   1: check at run time (via MIPS_MMI_{API,CHECK})
+#                   2: switch on unconditionally (inadvisable - instead pass
+#                      -mloongson-mmi -march=loongson3a to compiler options)
+#           When building libpng avoid using any setting other than '0'; '1' is
+#           set automatically when either 'API' or 'CHECK' are configured in,
+#           '2' should not be necessary as "-mloongson-mmi -march=loongson3a" will achieve the same
+#           effect as well as applying MMI optimizations to the rest of the
+#           libpng code.
+# MIPS_MMI_API:   (PNG_MIPS_MMI_OPT == 1) allow the optimization to be
+#                 switched on with png_set_option.
+# MIPS_MMI_CHECK: (PNG_MIPS_MMI_OPT == 1) compile a run-time check to see if
+#                 MMI extensions are supported.
+setting MIPS_MMI_OPT
+option MIPS_MMI_API disabled requires ALIGNED_MEMORY enables SET_OPTION,
+  sets MIPS_MMI_OPT 1
+option MIPS_MMI_CHECK disabled requires ALIGNED_MEMORY,
+  sets MIPS_MMI_OPT 1
+
 
 # These settings configure the default compression level (0-9) and 'strategy';
 # strategy is as defined by the implementors of zlib. It describes the input