diff --git a/simd/jsimd.h b/simd/jsimd.h
index 1c598f0..a312930 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -95,6 +95,13 @@
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
@@ -300,6 +307,13 @@
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
 
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
 EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 583a62b..8633162 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -26,8 +26,12 @@
 #include <string.h>
 #include <ctype.h>
 
+#define JSIMD_FASTLD3 1  /* simd_features bit: use the ld3-based RGB->YCC routines (else *_slowld3) */
+#define JSIMD_FASTST3 2  /* simd_features bit: use the st3-based YCC->RGB routines (else *_slowst3) */
+
 static unsigned int simd_support = ~0;
 static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3;
 
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
@@ -81,8 +85,9 @@
       }
       if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
         /* The SIMD version of Huffman encoding is slower than the C version on
-           Cavium ThunderX. */
-        simd_huffman = 0;
+           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
     }
     fclose(fd);
   }
@@ -136,6 +141,16 @@
   env = getenv("JSIMD_NOHUFFENC");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_huffman = 0;
+  env = getenv("JSIMD_FASTLD3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTLD3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTLD3;
+  env = getenv("JSIMD_FASTST3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTST3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTST3;
 }
 
 GLOBAL(int)
@@ -210,14 +225,20 @@
 
   switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
       neonfct=jsimd_extrgbx_ycc_convert_neon;
       break;
     case JCS_EXT_BGR:
-      neonfct=jsimd_extbgr_ycc_convert_neon;
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extbgr_ycc_convert_neon;
+      else
+        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
@@ -232,7 +253,10 @@
       neonfct=jsimd_extxrgb_ycc_convert_neon;
       break;
     default:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
       break;
   }
 
@@ -255,14 +279,20 @@
 
   switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
       neonfct=jsimd_ycc_extrgbx_convert_neon;
       break;
     case JCS_EXT_BGR:
-      neonfct=jsimd_ycc_extbgr_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extbgr_convert_neon;
+      else
+        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
@@ -277,7 +307,10 @@
       neonfct=jsimd_ycc_extxrgb_convert_neon;
       break;
     default:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
   }
 
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index c1998ba..3f003ce 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -1445,12 +1445,6 @@
  * Colorspace conversion YCbCr -> RGB
  */
 
-#if defined(__APPLE__) || defined(__ANDROID__)
-/* TODO: expand this to include other devices that are known not to have a slow
- *       st3 implementation. */
-#define ST3_IS_FAST
-#endif
-
 .macro do_load size
   .if \size == 8
     ld1             {v4.8b}, [U], 8
@@ -1488,44 +1482,44 @@
   .endif
 .endm
 
-.macro do_store bpp, size
+.macro do_store bpp, size, fast_st3
   .if \bpp == 24
     .if \size == 8
-#ifdef ST3_IS_FAST
-      st3           {v10.8b, v11.8b, v12.8b}, [RGB], 24
-#else
-      st1           {v10.b}[0], [RGB], #1
-      st1           {v11.b}[0], [RGB], #1
-      st1           {v12.b}[0], [RGB], #1
+      .if \fast_st3 == 1
+        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      .else
+        st1         {v10.b}[0], [RGB], #1
+        st1         {v11.b}[0], [RGB], #1
+        st1         {v12.b}[0], [RGB], #1
 
-      st1           {v10.b}[1], [RGB], #1
-      st1           {v11.b}[1], [RGB], #1
-      st1           {v12.b}[1], [RGB], #1
+        st1         {v10.b}[1], [RGB], #1
+        st1         {v11.b}[1], [RGB], #1
+        st1         {v12.b}[1], [RGB], #1
 
-      st1           {v10.b}[2], [RGB], #1
-      st1           {v11.b}[2], [RGB], #1
-      st1           {v12.b}[2], [RGB], #1
+        st1         {v10.b}[2], [RGB], #1
+        st1         {v11.b}[2], [RGB], #1
+        st1         {v12.b}[2], [RGB], #1
 
-      st1           {v10.b}[3], [RGB], #1
-      st1           {v11.b}[3], [RGB], #1
-      st1           {v12.b}[3], [RGB], #1
+        st1         {v10.b}[3], [RGB], #1
+        st1         {v11.b}[3], [RGB], #1
+        st1         {v12.b}[3], [RGB], #1
 
-      st1           {v10.b}[4], [RGB], #1
-      st1           {v11.b}[4], [RGB], #1
-      st1           {v12.b}[4], [RGB], #1
+        st1         {v10.b}[4], [RGB], #1
+        st1         {v11.b}[4], [RGB], #1
+        st1         {v12.b}[4], [RGB], #1
 
-      st1           {v10.b}[5], [RGB], #1
-      st1           {v11.b}[5], [RGB], #1
-      st1           {v12.b}[5], [RGB], #1
+        st1         {v10.b}[5], [RGB], #1
+        st1         {v11.b}[5], [RGB], #1
+        st1         {v12.b}[5], [RGB], #1
 
-      st1           {v10.b}[6], [RGB], #1
-      st1           {v11.b}[6], [RGB], #1
-      st1           {v12.b}[6], [RGB], #1
+        st1         {v10.b}[6], [RGB], #1
+        st1         {v11.b}[6], [RGB], #1
+        st1         {v12.b}[6], [RGB], #1
 
-      st1           {v10.b}[7], [RGB], #1
-      st1           {v11.b}[7], [RGB], #1
-      st1           {v12.b}[7], [RGB], #1
-#endif
+        st1         {v10.b}[7], [RGB], #1
+        st1         {v11.b}[7], [RGB], #1
+        st1         {v12.b}[7], [RGB], #1
+      .endif
     .elseif \size == 4
       st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
       st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
@@ -1573,7 +1567,9 @@
   .endif
 .endm
 
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+                                           g_offs, gsize, b_offs, bsize, \
+                                           defsize, fast_st3 = 1
 
 /*
  * 2-stage pipelined YCbCr->RGB conversion
@@ -1615,7 +1611,7 @@
   .endif
 .endm
 
-.macro do_yuv_to_rgb_stage2_store_load_stage1
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
     rshrn           v20.4h, v20.4s, #15
     rshrn           v24.4h, v24.4s, #14
     rshrn           v28.4h, v28.4s, #14
@@ -1662,7 +1658,7 @@
     prfm            pldl1keep, [Y, #64]
     sri             v25.8h, v29.8h, #11
   .endif
-    do_store        \bpp, 8
+    do_store        \bpp, 8, \fast_st3
     smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
     smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
@@ -1677,13 +1673,21 @@
  */
 
 .balign 16
+.if \fast_st3 == 1
 Ljsimd_ycc_\colorid\()_neon_consts:
+.else
+Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
+.endif
   .short 0,      0,     0,      0
   .short 22971, -11277, -23401, 29033
   .short -128,  -128,   -128,   -128
   .short -128,  -128,   -128,   -128
 
+.if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
     OUTPUT_WIDTH    .req x0
     INPUT_BUF       .req x1
     INPUT_ROW       .req x2
@@ -1753,12 +1757,12 @@
     subs            N, N, #8
     b.lt            2f
 1:
-    do_yuv_to_rgb_stage2_store_load_stage1
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
     subs            N, N, #8
     b.ge            1b
 2:
     do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
+    do_store        \bpp, 8, \fast_st3
     tst             N, #7
     b.eq            8f
 3:
@@ -1777,15 +1781,15 @@
     do_yuv_to_rgb
     tst             N, #4
     b.eq            6f
-    do_store        \bpp, 4
+    do_store        \bpp, 4, \fast_st3
 6:
     tst             N, #2
     b.eq            7f
-    do_store        \bpp, 2
+    do_store        \bpp, 2, \fast_st3
 7:
     tst             N, #1
     b.eq            8f
-    do_store        \bpp, 1
+    do_store        \bpp, 1, \fast_st3
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
     b.gt            0b
@@ -1827,7 +1831,7 @@
 
 .endm
 
-/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize */
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3 */
 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b
 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b
 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b
@@ -1836,6 +1840,9 @@
 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b
 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b
 
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
+
 .purgem do_load
 .purgem do_store
 
@@ -1887,50 +1894,44 @@
   .endif
 .endm
 
-#if defined(__APPLE__) || defined(__ANDROID__)
-/* TODO: expand this to include other devices that are known not to have a slow
- *       ld3 implementation. */
-#define LD3_IS_FAST
-#endif
-
-.macro do_load bpp, size
+.macro do_load bpp, size, fast_ld3
   .if \bpp == 24
     .if \size == 8
-#ifdef LD3_IS_FAST
-      ld3           {v10.8b, v11.8b, v12.8b}, [RGB], #24
-#else
-      ld1           {v10.b}[0], [RGB], #1
-      ld1           {v11.b}[0], [RGB], #1
-      ld1           {v12.b}[0], [RGB], #1
+      .if \fast_ld3 == 1
+        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      .else
+        ld1         {v10.b}[0], [RGB], #1
+        ld1         {v11.b}[0], [RGB], #1
+        ld1         {v12.b}[0], [RGB], #1
 
-      ld1           {v10.b}[1], [RGB], #1
-      ld1           {v11.b}[1], [RGB], #1
-      ld1           {v12.b}[1], [RGB], #1
+        ld1         {v10.b}[1], [RGB], #1
+        ld1         {v11.b}[1], [RGB], #1
+        ld1         {v12.b}[1], [RGB], #1
 
-      ld1           {v10.b}[2], [RGB], #1
-      ld1           {v11.b}[2], [RGB], #1
-      ld1           {v12.b}[2], [RGB], #1
+        ld1         {v10.b}[2], [RGB], #1
+        ld1         {v11.b}[2], [RGB], #1
+        ld1         {v12.b}[2], [RGB], #1
 
-      ld1           {v10.b}[3], [RGB], #1
-      ld1           {v11.b}[3], [RGB], #1
-      ld1           {v12.b}[3], [RGB], #1
+        ld1         {v10.b}[3], [RGB], #1
+        ld1         {v11.b}[3], [RGB], #1
+        ld1         {v12.b}[3], [RGB], #1
 
-      ld1           {v10.b}[4], [RGB], #1
-      ld1           {v11.b}[4], [RGB], #1
-      ld1           {v12.b}[4], [RGB], #1
+        ld1         {v10.b}[4], [RGB], #1
+        ld1         {v11.b}[4], [RGB], #1
+        ld1         {v12.b}[4], [RGB], #1
 
-      ld1           {v10.b}[5], [RGB], #1
-      ld1           {v11.b}[5], [RGB], #1
-      ld1           {v12.b}[5], [RGB], #1
+        ld1         {v10.b}[5], [RGB], #1
+        ld1         {v11.b}[5], [RGB], #1
+        ld1         {v12.b}[5], [RGB], #1
 
-      ld1           {v10.b}[6], [RGB], #1
-      ld1           {v11.b}[6], [RGB], #1
-      ld1           {v12.b}[6], [RGB], #1
+        ld1         {v10.b}[6], [RGB], #1
+        ld1         {v11.b}[6], [RGB], #1
+        ld1         {v12.b}[6], [RGB], #1
 
-      ld1           {v10.b}[7], [RGB], #1
-      ld1           {v11.b}[7], [RGB], #1
-      ld1           {v12.b}[7], [RGB], #1
-#endif
+        ld1         {v10.b}[7], [RGB], #1
+        ld1         {v11.b}[7], [RGB], #1
+        ld1         {v12.b}[7], [RGB], #1
+      .endif
       prfm          pldl1keep, [RGB, #128]
     .elseif \size == 4
       ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
@@ -1967,7 +1968,8 @@
   .endif
 .endm
 
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+                                           b_offs, fast_ld3 = 1
 
 /*
  * 2-stage pipelined RGB->YCbCr conversion
@@ -2020,9 +2022,9 @@
 
 /* TODO: expand macros and interleave instructions if some in-order
  *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
-.macro do_rgb_to_yuv_stage2_store_load_stage1
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
     do_rgb_to_yuv_stage2
-    do_load         \bpp, 8
+    do_load         \bpp, 8, \fast_ld3
     st1             {v20.8b}, [Y], #8
     st1             {v21.8b}, [U], #8
     st1             {v22.8b}, [V], #8
@@ -2030,13 +2032,21 @@
 .endm
 
 .balign 16
+.if \fast_ld3 == 1
 Ljsimd_\colorid\()_ycc_neon_consts:
+.else
+Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
+.endif
   .short 19595, 38470, 7471, 11059
   .short 21709, 32768, 27439, 5329
   .short 32767, 128, 32767, 128
   .short 32767, 128, 32767, 128
 
+.if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
     OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
     OUTPUT_BUF      .req x2
@@ -2081,12 +2091,12 @@
     /* Inner loop over pixels */
     subs            N, N, #8
     b.lt            3f
-    do_load         \bpp, 8
+    do_load         \bpp, 8, \fast_ld3
     do_rgb_to_yuv_stage1
     subs            N, N, #8
     b.lt            2f
 1:
-    do_rgb_to_yuv_stage2_store_load_stage1
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
     subs            N, N, #8
     b.ge            1b
 2:
@@ -2096,13 +2106,13 @@
     b.eq            8f
 3:
     tbz             N, #2, 3f
-    do_load         \bpp, 4
+    do_load         \bpp, 4, \fast_ld3
 3:
     tbz             N, #1, 4f
-    do_load         \bpp, 2
+    do_load         \bpp, 2, \fast_ld3
 4:
     tbz             N, #0, 5f
-    do_load         \bpp, 1
+    do_load         \bpp, 1, \fast_ld3
 5:
     do_rgb_to_yuv
     tbz             N, #2, 6f
@@ -2143,7 +2153,7 @@
 
 .endm
 
-/*--------------------------------- id ----- bpp R  G  B */
+/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
@@ -2151,6 +2161,9 @@
 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
 
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
+
 .purgem do_load
 .purgem do_store
 
