Make sure that Java is disabled when 12-bit is enabled (Java requires TurboJPEG, which currently only supports 8-bit samples), and tabs-->spaces
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1585 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/BUILDING.txt b/BUILDING.txt
index c40fc7c..485cfbe 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -81,14 +81,10 @@
for 64-bit build instructions.)
cd {source_directory}
- autoreconf -fiv
cd {build_directory}
sh {source_directory}/configure [additional configure flags]
make
-NOTE: Running autoreconf in the source directory is usually only necessary if
-building libjpeg-turbo from the SVN repository.
-
This will generate the following files under .libs/
libjpeg.a
@@ -422,12 +418,6 @@
Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
into a universal library.
-NOTE: If you are building libjpeg-turbo from the "official" project tarball,
-then it is highly likely that you will need to run 'autoreconf -fiv' in the
-source tree prior to building ARMv7, v7s, or v8 iOS binaries using the
-techniques described above. Otherwise, you may get a libtool error such as
-"unable to infer tagged configuration."
-
Building libjpeg-turbo for Android
----------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ee91a4..2ceb109 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@
endif()
project(libjpeg-turbo C)
-set(VERSION 1.4.1)
+set(VERSION 1.4.80)
if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index c3b3316..d9df904 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,17 @@
+1.5 pre-beta
+============
+
+[1] Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions. Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+[2] Added a new libjpeg API function (jpeg_skip_scanlines()) that can be used
+to partially decode a JPEG image. See libjpeg.txt for more details.
+
+
1.4.1
=====
diff --git a/README-turbo.txt b/README-turbo.txt
index 2a779dd..32b1d10 100755
--- a/README-turbo.txt
+++ b/README-turbo.txt
@@ -3,12 +3,12 @@
*******************************************************************************
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems. On such systems, libjpeg-turbo is generally 2-4x as fast as
-libjpeg, all else being equal. On other types of systems, libjpeg-turbo can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines. In many cases, the performance of
-libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is
+generally 2-4x as fast as libjpeg, all else being equal. On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines. In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
diff --git a/configure.ac b/configure.ac
index 6ef4b5c..be9fcdf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.4.1])
+AC_INIT([libjpeg-turbo], [1.4.80])
BUILD=`date +%Y%m%d`
AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
@@ -495,6 +495,10 @@
fi
fi
;;
+ powerpc*)
+ AC_MSG_RESULT([yes (powerpc)])
+ simd_arch=powerpc
+ ;;
*)
AC_MSG_RESULT([no ("$host_cpu")])
with_simd=no;
@@ -520,6 +524,7 @@
AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
+AM_CONDITIONAL([SIMD_POWERPC], [test "x$simd_arch" = "xpowerpc"])
AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
diff --git a/djpeg.c b/djpeg.c
index 8ddff96..6e3a0b9 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2014, D. R. Commander.
+ * Copyright (C) 2010-2011, 2013-2015, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains a command-line user interface for the JPEG decompressor.
@@ -88,6 +89,8 @@
static const char * progname; /* program name for error messages */
static char * outfilename; /* for -outfile switch */
boolean memsrc; /* for -memsrc switch */
+boolean stripe, skip;
+JDIMENSION startY, endY;
#define INPUT_BUF_SIZE 4096
@@ -164,6 +167,8 @@
fprintf(stderr, " -memsrc Load input file into memory before decompressing\n");
#endif
+ fprintf(stderr, " -skip Y0,Y1 Skip decoding a horizontal stripe of the image [Y0, Y1)\n");
+ fprintf(stderr, " -stripe Y0,Y1 Decode only a horizontal stripe of the image [Y0, Y1)\n");
fprintf(stderr, " -verbose or -debug Emit debug output\n");
fprintf(stderr, " -version Print version information and exit\n");
exit(EXIT_FAILURE);
@@ -189,6 +194,8 @@
requested_fmt = DEFAULT_FMT; /* set default output file format */
outfilename = NULL;
memsrc = FALSE;
+ stripe = FALSE;
+ skip = FALSE;
cinfo->err->trace_level = 0;
/* Scan command line options, adjust parameters */
@@ -361,7 +368,7 @@
/* RLE output format. */
requested_fmt = FMT_RLE;
- } else if (keymatch(arg, "scale", 1)) {
+ } else if (keymatch(arg, "scale", 2)) {
/* Scale the output image by a fraction M/N. */
if (++argn >= argc) /* advance to next argument */
usage();
@@ -369,6 +376,21 @@
&cinfo->scale_num, &cinfo->scale_denom) != 2)
usage();
+ } else if (keymatch(arg, "stripe", 2)) { /* decode only rows [Y0, Y1) */
+ if (++argn >= argc) /* advance to next argument */
+ usage();
+ /* startY/endY are JDIMENSION (an unsigned type), so scan with %u. */
+ if (sscanf(argv[argn], "%u,%u", &startY, &endY) != 2 || startY > endY)
+ usage();
+ stripe = TRUE;
+
+ } else if (keymatch(arg, "skip", 2)) { /* skip decoding rows [Y0, Y1) */
+ if (++argn >= argc) /* advance to next argument */
+ usage();
+ /* startY/endY are JDIMENSION (an unsigned type), so scan with %u. */
+ if (sscanf(argv[argn], "%u,%u", &startY, &endY) != 2 || startY > endY)
+ usage();
+
} else if (keymatch(arg, "targa", 1)) {
/* Targa output format. */
requested_fmt = FMT_TARGA;
@@ -634,14 +656,64 @@
/* Start decompressor */
(void) jpeg_start_decompress(&cinfo);
- /* Write output file header */
- (*dest_mgr->start_output) (&cinfo, dest_mgr);
+ /* Stripe decode */
+ if (stripe || skip) {
+ JDIMENSION tmp;
- /* Process data */
- while (cinfo.output_scanline < cinfo.output_height) {
- num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
- dest_mgr->buffer_height);
- (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+ /* Check for valid endY. We cannot check this value until after
+ * jpeg_start_decompress() is called. Note that we have already verified
+ * that startY <= endY.
+ */
+ if (endY > cinfo.output_height) { /* JDIMENSION values: print with %u */
+ fprintf(stderr, "%s: stripe %u-%u exceeds image height %u\n", progname,
+ startY, endY, cinfo.output_height);
+ exit(EXIT_FAILURE);
+ }
+
+ /* Write output file header. This is a hack to ensure that the destination
+ * manager creates an image of the proper size for the partial decode.
+ */
+ tmp = cinfo.output_height;
+ cinfo.output_height = endY - startY;
+ if (skip)
+ cinfo.output_height = tmp - cinfo.output_height;
+ (*dest_mgr->start_output) (&cinfo, dest_mgr);
+ cinfo.output_height = tmp;
+
+ /* Process data */
+ if (skip) {
+ while (cinfo.output_scanline < startY) {
+ num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+ dest_mgr->buffer_height);
+ (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+ }
+ jpeg_skip_scanlines(&cinfo, endY - startY);
+ while (cinfo.output_scanline < cinfo.output_height) {
+ num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+ dest_mgr->buffer_height);
+ (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+ }
+ } else {
+ jpeg_skip_scanlines(&cinfo, startY);
+ while (cinfo.output_scanline < endY) {
+ num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+ dest_mgr->buffer_height);
+ (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+ }
+ jpeg_skip_scanlines(&cinfo, cinfo.output_height - endY);
+ }
+
+ /* Normal full image decode */
+ } else {
+ /* Write output file header */
+ (*dest_mgr->start_output) (&cinfo, dest_mgr);
+
+ /* Process data */
+ while (cinfo.output_scanline < cinfo.output_height) {
+ num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+ dest_mgr->buffer_height);
+ (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+ }
}
#ifdef PROGRESS_REPORT
diff --git a/jdapistd.c b/jdapistd.c
index 3be527c..51b7dc6 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains application interface code for the decompression half
@@ -16,11 +17,10 @@
* whole decompression library into a transcoder.
*/
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jpegcomp.h"
-
+#include "jdmainct.h"
+#include "jdcoefct.h"
+#include "jdsample.h"
+#include "jmemsys.h"
/* Forward declarations */
LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
@@ -179,6 +179,242 @@
}
+/* Prepare temporary row buffer */
+
+LOCAL(void)
+dummy_buffer_setup (j_decompress_ptr cinfo)
+{
+ int nc; /* samples per output pixel used to size the row */
+ if (!cinfo->master || cinfo->master->dummy_row_buffer)
+ return; /* no master object, or the row is already allocated */
+ nc = (cinfo->out_color_space == JCS_RGB565) ?
+ 2 : cinfo->out_color_components;
+ cinfo->master->dummy_row_buffer =
+ jpeg_get_small((j_common_ptr) cinfo,
+ cinfo->output_width * nc * sizeof(JSAMPLE));
+ if (cinfo->master->dummy_row_buffer == NULL) /* allocation failed */
+ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 0);
+}
+
+
+/*
+ * Called by jpeg_skip_scanlines(). This partially skips a decompress block by
+ * incrementing the rowgroup counter.
+ */
+
+LOCAL(void)
+increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+{
+ int i;
+ JDIMENSION rows_left; /* rows beyond the last whole row group */
+ my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+
+ /* Increment the counter to the next row group after the skipped rows. */
+ main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
+
+ /* Partially skipping a row group would involve modifying the internal state
+ * of the upsampler, so read the remaining rows into a dummy buffer instead.
+ */
+ rows_left = rows % cinfo->max_v_samp_factor;
+ cinfo->output_scanline += rows - rows_left; /* whole row groups only */
+
+ dummy_buffer_setup(cinfo); /* allocate the scratch row if not yet present */
+ for (i = 0; i < rows_left; i++) /* one row per call; the output is unused */
+ jpeg_read_scanlines(cinfo, &(cinfo->master->dummy_row_buffer), 1);
+}
+
+
+/*
+ * Called by jpeg_skip_scanlines(). When we skip iMCU rows, we must update the
+ * iMCU row counter.
+ */
+
+LOCAL(void)
+increment_iMCU_ctr (j_decompress_ptr cinfo, JDIMENSION iMCU_rows)
+{
+ my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ if (main_ptr->iMCU_row_ctr == 0 && iMCU_rows > 0) /* counter leaves zero */
+ set_wraparound_pointers(cinfo); /* pointer lists: top-of-image -> normal */
+ main_ptr->iMCU_row_ctr += iMCU_rows; /* advance by the skipped iMCU rows */
+}
+
+
+/*
+ * Skips some scanlines of data from the JPEG decompressor.
+ *
+ * The return value will be the number of lines actually skipped. If skipping
+ * num_lines would move beyond the end of the image, then the actual number of
+ * lines remaining in the image is returned. Otherwise, the return value will
+ * be equal to num_lines.
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+{
+ my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ int i, y, x;
+ JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
+ JDIMENSION lines_to_skip, lines_to_read;
+
+ if (cinfo->global_state != DSTATE_SCANNING)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Do not skip past the bottom of the image (overflow-safe comparison). */
+ if (num_lines >= cinfo->output_height - cinfo->output_scanline) {
+ num_lines = cinfo->output_height - cinfo->output_scanline;
+ cinfo->output_scanline = cinfo->output_height;
+ return num_lines; /* lines actually skipped; compute before the update */
+ }
+ if (num_lines == 0)
+ return 0;
+
+ lines_per_iMCU_row = cinfo->_min_DCT_scaled_size * cinfo->max_v_samp_factor;
+ lines_left_in_iMCU_row =
+ (lines_per_iMCU_row - (cinfo->output_scanline % lines_per_iMCU_row)) %
+ lines_per_iMCU_row;
+ lines_after_iMCU_row = num_lines - lines_left_in_iMCU_row;
+
+ /* Skip the lines remaining in the current iMCU row. When upsampling
+ * requires context rows, we need the previous and next rows in order to read
+ * the current row. This adds some complexity.
+ */
+ if (cinfo->upsample->need_context_rows) {
+ /* If the skipped lines would not move us past the current iMCU row, we
+ * read the lines and ignore them. There might be a faster way of doing
+ * this, but we are facing increasing complexity for diminishing returns.
+ * The increasing complexity would be a by-product of meddling with the
+ * state machine used to skip context rows. Near the end of an iMCU row,
+ * the next iMCU row may have already been entropy-decoded. In this unique
+ * case, we will read the next iMCU row if we cannot skip past it as well.
+ */
+ if ((num_lines < lines_left_in_iMCU_row + 1) ||
+ (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full &&
+ lines_after_iMCU_row < lines_per_iMCU_row + 1)) {
+ dummy_buffer_setup(cinfo);
+ for (i = 0; i < num_lines; i++)
+ jpeg_read_scanlines(cinfo, &(cinfo->master->dummy_row_buffer), 1);
+ return num_lines;
+ }
+
+ /* If the next iMCU row has already been entropy-decoded, make sure that
+ * we do not skip too far.
+ */
+ if (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full) {
+ cinfo->output_scanline += lines_left_in_iMCU_row + lines_per_iMCU_row;
+ lines_after_iMCU_row -= lines_per_iMCU_row;
+ } else {
+ cinfo->output_scanline += lines_left_in_iMCU_row;
+ }
+ main_ptr->buffer_full = FALSE;
+ main_ptr->rowgroup_ctr = 0;
+ main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
+
+ /* Skipping is much simpler when context rows are not required. */
+ else {
+ if (num_lines < lines_left_in_iMCU_row) {
+ increment_simple_rowgroup_ctr(cinfo, num_lines);
+ return num_lines;
+ } else {
+ cinfo->output_scanline += lines_left_in_iMCU_row;
+ main_ptr->buffer_full = FALSE;
+ main_ptr->rowgroup_ctr = 0;
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
+ }
+
+ /* Calculate how many full iMCU rows we can skip. */
+ if (cinfo->upsample->need_context_rows)
+ lines_to_skip = ((lines_after_iMCU_row - 1) / lines_per_iMCU_row) *
+ lines_per_iMCU_row;
+ else
+ lines_to_skip = (lines_after_iMCU_row / lines_per_iMCU_row) *
+ lines_per_iMCU_row;
+ /* Calculate the number of lines that remain to be skipped after skipping all
+ * of the full iMCU rows that we can. We will not read these lines unless we
+ * have to.
+ */
+ lines_to_read = lines_after_iMCU_row - lines_to_skip;
+
+ /* For images requiring multiple scans (progressive, non-interleaved, etc.),
+ * all of the entropy decoding occurs in jpeg_start_decompress(), assuming
+ * that the input data source is non-suspending. This makes skipping easy.
+ */
+ if (cinfo->inputctl->has_multiple_scans) {
+ if (cinfo->upsample->need_context_rows) {
+ cinfo->output_scanline += lines_to_skip;
+ cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+ increment_iMCU_ctr(cinfo, lines_after_iMCU_row / lines_per_iMCU_row);
+ /* It is complex to properly move to the middle of a context block, so
+ * read the remaining lines instead of skipping them.
+ */
+ dummy_buffer_setup(cinfo);
+ for (i = 0; i < lines_to_read; i++)
+ jpeg_read_scanlines(cinfo, &(cinfo->master->dummy_row_buffer), 1);
+ } else {
+ cinfo->output_scanline += lines_to_skip;
+ cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+ increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+ }
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ return num_lines;
+ }
+
+ /* Skip the iMCU rows that we can safely skip. */
+ for (i = 0; i < lines_to_skip; i += lines_per_iMCU_row) {
+ for (y = 0; y < coef->MCU_rows_per_iMCU_row; y++) {
+ for (x = 0; x < cinfo->MCUs_per_row; x++) {
+ /* Calling decode_mcu() with a NULL pointer causes it to discard the
+ * decoded coefficients. This is ~5% faster for large subsets, but
+ * it's tough to tell a difference for smaller images. Another
+ * advantage of discarding coefficients is that it allows us to avoid
+ * accessing the private field cinfo->coef->MCU_buffer (which would
+ * normally be a parameter to decode_mcu().)
+ */
+ (*cinfo->entropy->decode_mcu) (cinfo, NULL);
+ }
+ }
+ cinfo->input_iMCU_row++;
+ cinfo->output_iMCU_row++;
+ if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows)
+ start_iMCU_row(cinfo);
+ else
+ (*cinfo->inputctl->finish_input_pass) (cinfo);
+ }
+ cinfo->output_scanline += lines_to_skip;
+
+ if (cinfo->upsample->need_context_rows) {
+ /* Context-based upsampling keeps track of iMCU rows. */
+ increment_iMCU_ctr(cinfo, lines_to_skip / lines_per_iMCU_row);
+
+ /* It is complex to properly move to the middle of a context block, so
+ * read the remaining lines instead of skipping them.
+ */
+ dummy_buffer_setup(cinfo);
+ for (i = 0; i < lines_to_read; i++)
+ jpeg_read_scanlines(cinfo, &(cinfo->master->dummy_row_buffer), 1);
+ } else {
+ increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+ }
+
+ /* Since skipping lines involves skipping the upsampling step, the value of
+ * "rows_to_go" will become invalid unless we set it here. NOTE: This is a
+ * bit odd, since "rows_to_go" seems to be redundantly keeping track of
+ * output_scanline.
+ */
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+
+ /* Always skip the requested number of lines. */
+ return num_lines;
+}
+
/*
* Alternate entry point to read raw data.
* Processes exactly one iMCU row per call, unless suspended.
@@ -271,6 +507,13 @@
if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return FALSE; /* Suspend, come back later */
}
+ if (cinfo->master->dummy_row_buffer) { /* Clean up row buffer */
+ int nc = (cinfo->out_color_space == JCS_RGB565) ?
+ 2 : cinfo->out_color_components;
+ jpeg_free_small((j_common_ptr) cinfo, cinfo->master->dummy_row_buffer,
+ cinfo->output_width * nc * sizeof(JSAMPLE));
+ cinfo->master->dummy_row_buffer = NULL; /* so the next pass reallocates */
+ }
cinfo->global_state = DSTATE_BUFIMAGE;
return TRUE;
}
diff --git a/jdarith.c b/jdarith.c
index c6a1a99..24e67fb 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains portable arithmetic entropy decoding routines for JPEG
@@ -516,7 +516,7 @@
/* Outer loop handles each block in the MCU */
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- block = MCU_data[blkn];
+ block = MCU_data ? MCU_data[blkn] : NULL;
ci = cinfo->MCU_membership[blkn];
compptr = cinfo->cur_comp_info[ci];
@@ -563,7 +563,8 @@
entropy->last_dc_val[ci] += v;
}
- (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+ if (block)
+ (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
/* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
@@ -607,7 +608,8 @@
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
v += 1; if (sign) v = -v;
- (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+ if (block)
+ (*block)[jpeg_natural_order[k]] = (JCOEF) v;
}
}
diff --git a/jdcoefct.c b/jdcoefct.c
index 199a628..17a97b1 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -4,6 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2010, D. R. Commander.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -16,53 +17,9 @@
* Also, the input side (only) is used when reading a file for transcoding.
*/
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
+#include "jdcoefct.h"
#include "jpegcomp.h"
-/* Block smoothing is only applicable for progressive JPEG, so: */
-#ifndef D_PROGRESSIVE_SUPPORTED
-#undef BLOCK_SMOOTHING_SUPPORTED
-#endif
-
-/* Private buffer controller object */
-
-typedef struct {
- struct jpeg_d_coef_controller pub; /* public fields */
-
- /* These variables keep track of the current location of the input side. */
- /* cinfo->input_iMCU_row is also used for this. */
- JDIMENSION MCU_ctr; /* counts MCUs processed in current row */
- int MCU_vert_offset; /* counts MCU rows within iMCU row */
- int MCU_rows_per_iMCU_row; /* number of such rows needed */
-
- /* The output side's location is represented by cinfo->output_iMCU_row. */
-
- /* In single-pass modes, it's sufficient to buffer just one MCU.
- * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
- * and let the entropy decoder write into that workspace each time.
- * In multi-pass modes, this array points to the current MCU's blocks
- * within the virtual arrays; it is used only by the input side.
- */
- JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
-
- /* Temporary workspace for one MCU */
- JCOEF * workspace;
-
-#ifdef D_MULTISCAN_FILES_SUPPORTED
- /* In multi-pass modes, we need a virtual block array for each component. */
- jvirt_barray_ptr whole_image[MAX_COMPONENTS];
-#endif
-
-#ifdef BLOCK_SMOOTHING_SUPPORTED
- /* When doing block smoothing, we latch coefficient Al values here */
- int * coef_bits_latch;
-#define SAVED_COEFS 6 /* we save coef_bits[0..5] */
-#endif
-} my_coef_controller;
-
-typedef my_coef_controller * my_coef_ptr;
/* Forward declarations */
METHODDEF(int) decompress_onepass
@@ -78,30 +35,6 @@
#endif
-LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
-/* Reset within-iMCU-row counters for a new row (input side) */
-{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-
- /* In an interleaved scan, an MCU row is the same as an iMCU row.
- * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
- * But at the bottom of the image, process only what's left.
- */
- if (cinfo->comps_in_scan > 1) {
- coef->MCU_rows_per_iMCU_row = 1;
- } else {
- if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
- coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
- else
- coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
- }
-
- coef->MCU_ctr = 0;
- coef->MCU_vert_offset = 0;
-}
-
-
/*
* Initialize for an input processing pass.
*/
diff --git a/jdcoefct.h b/jdcoefct.h
new file mode 100644
index 0000000..2f7bbe5
--- /dev/null
+++ b/jdcoefct.h
@@ -0,0 +1,82 @@
+/*
+ * jdcoefct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * For conditions of distribution and use, see the accompanying README file.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Block smoothing is only applicable for progressive JPEG, so: */
+#ifndef D_PROGRESSIVE_SUPPORTED
+#undef BLOCK_SMOOTHING_SUPPORTED
+#endif
+
+
+/* Private buffer controller object */
+
+typedef struct {
+ struct jpeg_d_coef_controller pub; /* public fields */
+
+ /* These variables keep track of the current location of the input side. */
+ /* cinfo->input_iMCU_row is also used for this. */
+ JDIMENSION MCU_ctr; /* counts MCUs processed in current row */
+ int MCU_vert_offset; /* counts MCU rows within iMCU row */
+ int MCU_rows_per_iMCU_row; /* number of such rows needed */
+
+ /* The output side's location is represented by cinfo->output_iMCU_row. */
+
+ /* In single-pass modes, it's sufficient to buffer just one MCU.
+ * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
+ * and let the entropy decoder write into that workspace each time.
+ * In multi-pass modes, this array points to the current MCU's blocks
+ * within the virtual arrays; it is used only by the input side.
+ */
+ JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+
+ /* Temporary workspace for one MCU */
+ JCOEF * workspace;
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+ /* In multi-pass modes, we need a virtual block array for each component. */
+ jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+#endif
+
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+ /* When doing block smoothing, we latch coefficient Al values here */
+ int * coef_bits_latch;
+#define SAVED_COEFS 6 /* we save coef_bits[0..5] */
+#endif
+} my_coef_controller;
+
+typedef my_coef_controller * my_coef_ptr;
+
+
+LOCAL(void)
+start_iMCU_row (j_decompress_ptr cinfo)
+/* Reset within-iMCU-row counters for a new row (input side) */
+{
+ my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+
+ /* In an interleaved scan, an MCU row is the same as an iMCU row.
+ * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
+ * But at the bottom of the image, process only what's left.
+ */
+ if (cinfo->comps_in_scan > 1) {
+ coef->MCU_rows_per_iMCU_row = 1;
+ } else {
+ if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+ coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
+ else
+ coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
+ }
+
+ coef->MCU_ctr = 0; /* no MCUs processed yet in the current row */
+ coef->MCU_vert_offset = 0; /* restart at the first MCU row of the iMCU row */
+}
diff --git a/jdhuff.c b/jdhuff.c
index cbdce7d..877ff10 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2015, D. R. Commander.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains Huffman entropy decoding routines.
@@ -562,7 +562,7 @@
ASSIGN_STATE(state, entropy->saved);
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data[blkn];
+ JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
register int s, k, r;
@@ -582,11 +582,13 @@
int ci = cinfo->MCU_membership[blkn];
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
- /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
- (*block)[0] = (JCOEF) s;
+ if (block) {
+ /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+ (*block)[0] = (JCOEF) s;
+ }
}
- if (entropy->ac_needed[blkn]) {
+ if (entropy->ac_needed[blkn] && block) {
/* Section F.2.2.2: decode the AC coefficients */
/* Since zeroes are skipped, output area must be cleared beforehand */
@@ -659,7 +661,7 @@
ASSIGN_STATE(state, entropy->saved);
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data[blkn];
+ JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
register int s, k, r, l;
@@ -675,10 +677,11 @@
int ci = cinfo->MCU_membership[blkn];
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
- (*block)[0] = (JCOEF) s;
+ if (block)
+ (*block)[0] = (JCOEF) s;
}
- if (entropy->ac_needed[blkn]) {
+ if (entropy->ac_needed[blkn] && block) {
for (k = 1; k < DCTSIZE2; k++) {
HUFF_DECODE_FAST(s, l, actbl);
diff --git a/jdmainct.c b/jdmainct.c
index 7f7bd33..e4ba1c4 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -15,10 +15,7 @@
* supplies the equivalent of the main buffer in that case.
*/
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jdmainct.h"
/*
@@ -112,36 +109,6 @@
*/
-/* Private buffer controller object */
-
-typedef struct {
- struct jpeg_d_main_controller pub; /* public fields */
-
- /* Pointer to allocated workspace (M or M+2 row groups). */
- JSAMPARRAY buffer[MAX_COMPONENTS];
-
- boolean buffer_full; /* Have we gotten an iMCU row from decoder? */
- JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */
-
- /* Remaining fields are only used in the context case. */
-
- /* These are the master pointers to the funny-order pointer lists. */
- JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */
-
- int whichptr; /* indicates which pointer set is now in use */
- int context_state; /* process_data state machine status */
- JDIMENSION rowgroups_avail; /* row groups available to postprocessor */
- JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */
-} my_main_controller;
-
-typedef my_main_controller * my_main_ptr;
-
-/* context_state values: */
-#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for MCU row */
-#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */
-#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */
-
-
/* Forward declarations */
METHODDEF(void) process_data_simple_main
(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
@@ -238,34 +205,6 @@
LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
-/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
- * This changes the pointer list state from top-of-image to the normal state.
- */
-{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
- int ci, i, rgroup;
- int M = cinfo->_min_DCT_scaled_size;
- jpeg_component_info *compptr;
- JSAMPARRAY xbuf0, xbuf1;
-
- for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
- ci++, compptr++) {
- rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
- cinfo->_min_DCT_scaled_size; /* height of a row group of component */
- xbuf0 = main_ptr->xbuffer[0][ci];
- xbuf1 = main_ptr->xbuffer[1][ci];
- for (i = 0; i < rgroup; i++) {
- xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
- xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
- xbuf0[rgroup*(M+2) + i] = xbuf0[i];
- xbuf1[rgroup*(M+2) + i] = xbuf1[i];
- }
- }
-}
-
-
-LOCAL(void)
set_bottom_pointers (j_decompress_ptr cinfo)
/* Change the pointer lists to duplicate the last sample row at the bottom
* of the image. whichptr indicates which xbuffer holds the final iMCU row.
diff --git a/jdmainct.h b/jdmainct.h
new file mode 100644
index 0000000..37ab27d
--- /dev/null
+++ b/jdmainct.h
@@ -0,0 +1,71 @@
+/*
+ * jdmainct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README file.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Private buffer controller object */
+
+typedef struct {
+ struct jpeg_d_main_controller pub; /* public fields */
+
+ /* Pointer to allocated workspace (M or M+2 row groups). */
+ JSAMPARRAY buffer[MAX_COMPONENTS];
+
+ boolean buffer_full; /* Have we gotten an iMCU row from decoder? */
+ JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */
+
+ /* Remaining fields are only used in the context case. */
+
+ /* These are the master pointers to the funny-order pointer lists. */
+ JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */
+
+ int whichptr; /* indicates which pointer set is now in use */
+ int context_state; /* process_data state machine status */
+ JDIMENSION rowgroups_avail; /* row groups available to postprocessor */
+ JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */
+} my_main_controller;
+
+typedef my_main_controller * my_main_ptr;
+
+
+/* context_state values: */
+#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for MCU row */
+#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */
+#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */
+
+
+LOCAL(void)
+set_wraparound_pointers (j_decompress_ptr cinfo)
+/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
+ * This changes the pointer list state from top-of-image to the normal state.
+ */
+{
+ my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ int ci, i, rgroup;
+ int M = cinfo->_min_DCT_scaled_size;
+ jpeg_component_info *compptr;
+ JSAMPARRAY xbuf0, xbuf1;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+ cinfo->_min_DCT_scaled_size; /* height of a row group of component */
+ xbuf0 = main_ptr->xbuffer[0][ci];
+ xbuf1 = main_ptr->xbuffer[1][ci];
+ for (i = 0; i < rgroup; i++) {
+ xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
+ xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
+ xbuf0[rgroup*(M+2) + i] = xbuf0[i];
+ xbuf1[rgroup*(M+2) + i] = xbuf1[i];
+ }
+ }
+}
diff --git a/jdsample.c b/jdsample.c
index 2752966..5738c2f 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -22,51 +22,11 @@
* Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7.
*/
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
+#include "jdsample.h"
#include "jsimd.h"
#include "jpegcomp.h"
-/* Pointer to routine to upsample a single component */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
- jpeg_component_info * compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY * output_data_ptr);
-
-/* Private subobject */
-
-typedef struct {
- struct jpeg_upsampler pub; /* public fields */
-
- /* Color conversion buffer. When using separate upsampling and color
- * conversion steps, this buffer holds one upsampled row group until it
- * has been color converted and output.
- * Note: we do not allocate any storage for component(s) which are full-size,
- * ie do not need rescaling. The corresponding entry of color_buf[] is
- * simply set to point to the input data array, thereby avoiding copying.
- */
- JSAMPARRAY color_buf[MAX_COMPONENTS];
-
- /* Per-component upsampling method pointers */
- upsample1_ptr methods[MAX_COMPONENTS];
-
- int next_row_out; /* counts rows emitted from color_buf */
- JDIMENSION rows_to_go; /* counts rows remaining in image */
-
- /* Height of an input row group for each component. */
- int rowgroup_height[MAX_COMPONENTS];
-
- /* These arrays save pixel expansion factors so that int_expand need not
- * recompute them each time. They are unused for other upsampling methods.
- */
- UINT8 h_expand[MAX_COMPONENTS];
- UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler * my_upsample_ptr;
-
/*
* Initialize for an upsampling pass.
diff --git a/jdsample.h b/jdsample.h
new file mode 100644
index 0000000..5226f26
--- /dev/null
+++ b/jdsample.h
@@ -0,0 +1,50 @@
+/*
+ * jdsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README file.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Pointer to routine to upsample a single component */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr);
+
+/* Private subobject */
+
+typedef struct {
+ struct jpeg_upsampler pub; /* public fields */
+
+ /* Color conversion buffer. When using separate upsampling and color
+ * conversion steps, this buffer holds one upsampled row group until it
+ * has been color converted and output.
+ * Note: we do not allocate any storage for component(s) which are full-size,
+ * ie do not need rescaling. The corresponding entry of color_buf[] is
+ * simply set to point to the input data array, thereby avoiding copying.
+ */
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+
+ /* Per-component upsampling method pointers */
+ upsample1_ptr methods[MAX_COMPONENTS];
+
+ int next_row_out; /* counts rows emitted from color_buf */
+ JDIMENSION rows_to_go; /* counts rows remaining in image */
+
+ /* Height of an input row group for each component. */
+ int rowgroup_height[MAX_COMPONENTS];
+
+ /* These arrays save pixel expansion factors so that int_expand need not
+ * recompute them each time. They are unused for other upsampling methods.
+ */
+ UINT8 h_expand[MAX_COMPONENTS];
+ UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler * my_upsample_ptr;
diff --git a/jpegint.h b/jpegint.h
index 025accd..0e4dafe 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README file.
*
* This file provides common declarations for the various JPEG modules.
@@ -137,6 +137,12 @@
/* State variables made visible to other modules */
boolean is_dummy_pass; /* True during 1st pass for 2-pass quant */
+
+ /* Buffer large enough to store an output row. This is used when
+ * jpeg_skip_scanlines() chooses to "skip" a row by reading it into this
+ * dummy buffer.
+ */
+ JSAMPROW dummy_row_buffer;
};
/* Input control module */
diff --git a/jpeglib.h b/jpeglib.h
index 9615c5d..a41bec4 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -6,6 +6,7 @@
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2013-2014, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README file.
*
* This file defines the application interface for the JPEG library.
@@ -990,6 +991,8 @@
EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
JSAMPARRAY scanlines,
JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
+ JDIMENSION num_lines);
EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
/* Replaces jpeg_read_scanlines when reading raw downsampled data. */
diff --git a/libjpeg.txt b/libjpeg.txt
index 7dbb354..52b6d8b 100644
--- a/libjpeg.txt
+++ b/libjpeg.txt
@@ -3,7 +3,8 @@
This file was part of the Independent JPEG Group's software:
Copyright (C) 1994-2011, Thomas G. Lane, Guido Vollbeding.
libjpeg-turbo Modifications:
-Copyright (C) 2010, 2014, D. R. Commander.
+Copyright (C) 2010, 2014, 2015, D. R. Commander.
+Copyright (C) 2015, Google, Inc.
For conditions of distribution and use, see the accompanying README file.
@@ -729,6 +730,49 @@
The previous discussion of aborting compression cycles applies here too.
+Skipping rows when decompressing
+--------------------------------
+
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+This function provides application programmers with the ability to skip over
+multiple rows in the JPEG image, thus decoding only a subset of the image data.
+This is convenient for performance-critical applications that wish to view only
+a portion of a large JPEG image without decompressing the whole thing.  It is
+also useful in memory-constrained environments (such as on mobile devices.)
+
+Suspending data sources are not supported by this function. Calling
+jpeg_skip_scanlines() with a suspending data source will result in undefined
+behavior.
+
+jpeg_skip_scanlines() will not allow skipping past the bottom of the image. If
+the value of num_lines is large enough to skip past the bottom of the image,
+then the function will skip to the end of the image instead.
+
+If the value of num_lines is valid, then jpeg_skip_scanlines() will always
+skip all of the input rows requested. There is no need to inspect the return
+value of the function in that case.
+
+Best results will be achieved by calling jpeg_skip_scanlines() for large chunks
+of rows. The function should be viewed as a way to quickly jump to a
+particular vertical offset in the JPEG image in order to decode a subset of the
+image. Used in this manner, it will provide significant performance
+improvements.
+
+Calling jpeg_skip_scanlines() for small values of num_lines has several
+potential drawbacks:
+ 1) JPEG decompression occurs in blocks, so if jpeg_skip_scanlines() is
+ called from the middle of a decompression block, then it is likely that
+ much of the decompression work has already been done for the first
+ couple of rows that need to be skipped.
+ 2) When this function returns, it must leave the decompressor in a state
+ such that it is ready to read the next line. This may involve
+ decompressing a block that must be partially skipped.
+These issues are especially tricky for cases in which upsampling requires
+context rows. In the worst case, jpeg_skip_scanlines() will perform similarly
+to jpeg_read_scanlines() (since it will actually call jpeg_read_scanlines().)
+
+
Mechanics of usage: include files, linking, etc
-----------------------------------------------
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index b9f6ca5..2f00e8a 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64, and ARM systems. On such systems, libjpeg-turbo is generally 2-4x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is generally 2-4x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API. libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
diff --git a/release/deb-control.tmpl b/release/deb-control.tmpl
index 510b1d6..1a6242b 100644
--- a/release/deb-control.tmpl
+++ b/release/deb-control.tmpl
@@ -9,11 +9,11 @@
Installed-Size: {__SIZE}
Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
- NEON) to accelerate baseline JPEG compression and decompression on x86,
- x86-64, and ARM systems. On such systems, libjpeg-turbo is generally 2-4x as
- fast as libjpeg, all else being equal. On other types of systems,
- libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue
- of its highly-optimized Huffman coding routines. In many cases, the
+ NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+ x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is
+ generally 2-4x as fast as libjpeg, all else being equal. On other types of
+ systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
+ by virtue of its highly-optimized Huffman coding routines. In many cases, the
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG
codecs.
.
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 23793cf..03d75d1 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -44,12 +44,12 @@
%description
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems. On such systems, libjpeg-turbo is generally 2-4x as fast as
-libjpeg, all else being equal. On other types of systems, libjpeg-turbo can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines. In many cases, the performance of
-libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is
+generally 2-4x as fast as libjpeg, all else being equal. On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines. In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 3029f1c..ebb9ec6 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -6,7 +6,8 @@
jccolext-mmx.asm jcgryext-mmx.asm jdcolext-mmx.asm jdmrgext-mmx.asm \
jccolext-sse2.asm jcgryext-sse2.asm jdcolext-sse2.asm jdmrgext-sse2.asm \
jccolext-sse2-64.asm jcgryext-sse2-64.asm jdcolext-sse2-64.asm \
- jdmrgext-sse2-64.asm
+ jdmrgext-sse2-64.asm jccolext-altivec.c jcgryext-altivec.c \
+ jdcolext-altivec.c jdmrgext-altivec.c
if SIMD_X86_64
@@ -70,6 +71,23 @@
endif
+if SIMD_POWERPC
+
+libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h \
+ jccolor-altivec.c jcgray-altivec.c jcsample-altivec.c \
+ jdcolor-altivec.c jdmerge-altivec.c jdsample-altivec.c \
+ jfdctfst-altivec.c jfdctint-altivec.c \
+ jidctfst-altivec.c jidctint-altivec.c \
+ jquanti-altivec.c
+libsimd_la_CFLAGS = -maltivec
+
+jccolor-altivec.lo: jccolext-altivec.c
+jcgray-altivec.lo: jcgryext-altivec.c
+jdcolor-altivec.lo: jdcolext-altivec.c
+jdmerge-altivec.lo: jdmrgext-altivec.c
+
+endif
+
AM_CPPFLAGS = -I$(top_srcdir)
.asm.lo:
diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
new file mode 100644
index 0000000..403aa96
--- /dev/null
+++ b/simd/jccolext-altivec.c
@@ -0,0 +1,267 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+
+void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+ __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+ rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+ __vector unsigned char rgb4 = {0};
+#endif
+ __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+ __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+ __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+ /* Constants */
+ __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+ pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+ pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+ pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+ __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) },
+ pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+ __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+ shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ for (num_cols = pitch; num_cols > 0;
+ num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+ outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+ /* Load 16 pixels == 48 or 64 bytes */
+ offset = (size_t)inptr & 15;
+ if (offset) {
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset;
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overread. Since there is no way to
+ * read a partial AltiVec register, overread would occur on the last
+ * chunk of the last image row if the right edge is not on a 16-byte
+ * boundary. It could also occur on other rows if the bytes per row
+ * is low enough. Since we can't determine whether we're on the last
+ * image row, we have to assume every row is the last.
+ */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_ld(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (bytes > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (bytes > 32)
+ rgb2 = vec_ld(32, inptr);
+ if (bytes > 48)
+ rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ rgb4 = vec_ld(64, inptr);
+#endif
+ unaligned_shift_index = vec_lvsl(0, inptr);
+ rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+ rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+ }
+ } else {
+#endif /* __BIG_ENDIAN__ */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = VEC_LD(0, tmpbuf);
+ rgb1 = VEC_LD(16, tmpbuf);
+ rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = VEC_LD(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path */
+ rgb0 = VEC_LD(0, inptr);
+ if (num_cols > 16)
+ rgb1 = VEC_LD(16, inptr);
+ if (num_cols > 32)
+ rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = VEC_LD(48, inptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ }
+#endif
+
+#if RGB_PIXELSIZE == 3
+ /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+ rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+ rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+ rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+ /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+ rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+ rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+ rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+ /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+ * ...
+ *
+ * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+ /* (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
+
+ /* Calculate Y values */
+
+ y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+ y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+ y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+ y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+ y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+ y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+ y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+ y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+ shift_pack_index);
+ yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+ shift_pack_index);
+ y = vec_pack(yl, yh);
+ vec_st(y, 0, outptr0);
+
+ /* Calculate Cb values */
+ cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+ (__vector unsigned int)cb0);
+ cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+ (__vector unsigned int)cb1);
+ cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+ (__vector unsigned int)cb2);
+ cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+ (__vector unsigned int)cb3);
+ cbl = vec_perm((__vector unsigned short)cb0,
+ (__vector unsigned short)cb1, shift_pack_index);
+ cbh = vec_perm((__vector unsigned short)cb2,
+ (__vector unsigned short)cb3, shift_pack_index);
+ cb = vec_pack(cbl, cbh);
+ vec_st(cb, 0, outptr1);
+
+ /* Calculate Cr values */
+ cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+ (__vector unsigned int)cr0);
+ cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+ (__vector unsigned int)cr1);
+ cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+ (__vector unsigned int)cr2);
+ cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+ (__vector unsigned int)cr3);
+ crl = vec_perm((__vector unsigned short)cr0,
+ (__vector unsigned short)cr1, shift_pack_index);
+ crh = vec_perm((__vector unsigned short)cr2,
+ (__vector unsigned short)cr3, shift_pack_index);
+ cr = vec_pack(crl, crh);
+ vec_st(cr, 0, outptr2);
+ }
+ }
+}
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
new file mode 100644
index 0000000..04b8708
--- /dev/null
+++ b/simd/jccolor-altivec.c
@@ -0,0 +1,104 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329 /* FIX(0.08131) */
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_168 11059 /* FIX(0.16874) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_331 21709 /* FIX(0.33126) */
+#define F_0_418 27439 /* FIX(0.41869) */
+#define F_0_500 32768 /* FIX(0.50000) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/simd/jcgray-altivec.c b/simd/jcgray-altivec.c
new file mode 100644
index 0000000..b52fade
--- /dev/null
+++ b/simd/jcgray-altivec.c
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_587 38470 /* FIX(0.58700); larger than 32767, hence the split below */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16 /* fractional bits of the FIX() constants above */
+#define ONE_HALF (1 << (SCALEBITS - 1)) /* 0.5 in the same fixed-point scale */
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jcgryext-altivec.c" /* default variant; RGB_PIXELSIZE is not defined in this file at this point, so it presumably comes from a header */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_RGB variant; reuses the RGBG_INDEX0..3 tables above */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_RGBX variant; single RGBG_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_BGR variant */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_BGRX variant */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_XBGR variant */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_XRGB variant */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
new file mode 100644
index 0000000..c171615
--- /dev/null
+++ b/simd/jcgryext-altivec.c
@@ -0,0 +1,261 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+
+/*
+ * Convert num_rows rows of packed RGB/RGBX-style pixels to grayscale.
+ *
+ * img_width   pixels per row
+ * input_buf   input row pointers (RGB_PIXELSIZE bytes per pixel); one row is
+ *             consumed per iteration
+ * output_buf  output planes; only output_buf[0] is written, one byte per pixel
+ * output_row  index of the first row to write within output_buf[0]
+ * num_rows    number of rows to convert
+ *
+ * Pixels are processed 16 at a time, and each chunk ends with an unconditional
+ * 16-byte vec_st() to the output row.  NOTE(review): that presumes output rows
+ * are 16-byte aligned and padded to a multiple of 16 bytes -- confirm against
+ * the caller.  A partial final input chunk is staged through tmpbuf (the
+ * "slow path") to avoid overreading the input row; this is done on both
+ * big- and little-endian targets.
+ */
+void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
+                                     JSAMPARRAY input_buf,
+                                     JSAMPIMAGE output_buf,
+                                     JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  /* Bounce buffer for a partial final chunk (needed on both endiannesses) */
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = {0};
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread. Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary. It could also occur on other rows if the bytes per row
+           * is low enough. Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian.  vec_vsx_ld() accepts unaligned addresses but still
+       * reads a full 16 bytes, so a partial final chunk needs the same slow
+       * path as above; loading it straight from inptr would read past the end
+       * of the input row.
+       */
+      if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+        /* Slow path */
+        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+        rgb0 = vec_vsx_ld(0, tmpbuf);
+        rgb1 = vec_vsx_ld(16, tmpbuf);
+        rgb2 = vec_vsx_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+        rgb3 = vec_vsx_ld(48, tmpbuf);
+#endif
+      } else {
+        /* Fast path */
+        rgb0 = vec_vsx_ld(0, inptr);
+        if (num_cols > 16)
+          rgb1 = vec_vsx_ld(16, inptr);
+        if (num_cols > 32)
+          rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+        if (num_cols > 48)
+          rgb3 = vec_vsx_ld(48, inptr);
+#endif
+      }
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);
+    }
+  }
+}
diff --git a/simd/jcsample-altivec.c b/simd/jcsample-altivec.c
new file mode 100644
index 0000000..603492d
--- /dev/null
+++ b/simd/jcsample-altivec.c
@@ -0,0 +1,158 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void
+jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* 2:1 horizontal downsample: each output sample is (even + odd + bias) >> 1 of two adjacent inputs */
+ int outrow, outcol;
+ JDIMENSION output_cols = width_blocks * DCTSIZE; /* output samples per row */
+ JSAMPROW inptr, outptr;
+
+ __vector unsigned char this0, next0, out;
+ __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+ /* Constants */
+ __vector unsigned short pw_bias = { __4X2(0, 1) }, /* rounding bias; presumably the pair 0,1 repeated across lanes */
+ pw_one = { __8X(1) }; /* shift count for the divide by 2 */
+ __vector unsigned char even_odd_index =
+ {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, /* even-numbered bytes first, then odd-numbered */
+ pb_zero = { __16X(0) }; /* not referenced by name below; presumably used inside VEC_UNPACK*U() */
+
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2); /* pad every input row out to 2 * output_cols samples */
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 16, inptr += 32, outptr += 16) { /* 32 inputs -> 16 outputs per pass */
+
+ this0 = vec_ld(0, inptr); /* first 16 input samples */
+ this0 = vec_perm(this0, this0, even_odd_index); /* evens in bytes 0-7, odds in bytes 8-15 */
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0); /* even samples as 16-bit lanes */
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0); /* odd samples as 16-bit lanes */
+ outl = vec_add(this0e, this0o);
+ outl = vec_add(outl, pw_bias);
+ outl = vec_sr(outl, pw_one); /* (even + odd + bias) >> 1 */
+
+ if (outcol > 8) { /* more than 8 outputs remain, so the next 16 inputs are needed too */
+ next0 = vec_ld(16, inptr);
+ next0 = vec_perm(next0, next0, even_odd_index);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+ outh = vec_add(next0e, next0o);
+ outh = vec_add(outh, pw_bias);
+ outh = vec_sr(outh, pw_one);
+ } else
+ outh = vec_splat_u16(0); /* upper 8 outputs are beyond output_cols; fill with zero */
+
+ out = vec_pack(outl, outh); /* narrow 2 x 8 words to 16 bytes */
+ vec_st(out, 0, outptr); /* always stores a full 16 bytes */
+ }
+ }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* 2:1 horizontal and vertical downsample: each output sample is (sum of a 2x2 input box + bias) >> 2 */
+ int inrow, outrow, outcol;
+ JDIMENSION output_cols = width_blocks * DCTSIZE; /* output samples per row */
+ JSAMPROW inptr0, inptr1, outptr;
+
+ __vector unsigned char this0, next0, this1, next1, out;
+ __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+ /* Constants */
+ __vector unsigned short pw_bias = { __4X2(1, 2) }, /* rounding bias; presumably the pair 1,2 repeated across lanes */
+ pw_two = { __8X(2) }; /* shift count for the divide by 4 */
+ __vector unsigned char even_odd_index =
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, /* even-numbered bytes first, then odd-numbered */
+ pb_zero = { __16X(0) }; /* not referenced by name below; presumably used inside VEC_UNPACK*U() */
+
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2); /* pad every input row out to 2 * output_cols samples */
+
+ for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+ inrow += 2, outrow++) { /* two input rows feed one output row */
+
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr = output_data[outrow];
+
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { /* 2 x 32 inputs -> 16 outputs per pass */
+
+ this0 = vec_ld(0, inptr0); /* first 16 samples of the upper row */
+ this0 = vec_perm(this0, this0, even_odd_index); /* evens in bytes 0-7, odds in bytes 8-15 */
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+ out0l = vec_add(this0e, this0o); /* horizontal pair sums, upper row */
+
+ this1 = vec_ld(0, inptr1); /* first 16 samples of the lower row */
+ this1 = vec_perm(this1, this1, even_odd_index);
+ this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+ this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+ out1l = vec_add(this1e, this1o); /* horizontal pair sums, lower row */
+
+ outl = vec_add(out0l, out1l);
+ outl = vec_add(outl, pw_bias);
+ outl = vec_sr(outl, pw_two); /* (2x2 sum + bias) >> 2 */
+
+ if (outcol > 8) { /* more than 8 outputs remain, so the next 16 inputs of each row are needed too */
+ next0 = vec_ld(16, inptr0);
+ next0 = vec_perm(next0, next0, even_odd_index);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+ out0h = vec_add(next0e, next0o);
+
+ next1 = vec_ld(16, inptr1);
+ next1 = vec_perm(next1, next1, even_odd_index);
+ next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+ next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+ out1h = vec_add(next1e, next1o);
+
+ outh = vec_add(out0h, out1h);
+ outh = vec_add(outh, pw_bias);
+ outh = vec_sr(outh, pw_two);
+ } else
+ outh = vec_splat_u16(0); /* upper 8 outputs are beyond output_cols; fill with zero */
+
+ out = vec_pack(outl, outh); /* narrow 2 x 8 words to 16 bytes */
+ vec_st(out, 0, outptr); /* always stores a full 16 bytes */
+ }
+ }
+}
diff --git a/simd/jcsample.h b/simd/jcsample.h
new file mode 100644
index 0000000..b1ef502
--- /dev/null
+++ b/simd/jcsample.h
@@ -0,0 +1,27 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README file.
+ */
+
+LOCAL(void)
+expand_right_edge (JSAMPARRAY image_data, int num_rows,
+ JDIMENSION input_cols, JDIMENSION output_cols)
+{ /* Widen each of num_rows rows from input_cols to output_cols samples by repeating the last sample */
+ register JSAMPROW ptr;
+ register JSAMPLE pixval;
+ register int count;
+ int row;
+ int numcols = (int) (output_cols - input_cols); /* samples to append per row */
+
+ if (numcols > 0) { /* nothing to do when the rows are already wide enough */
+ for (row = 0; row < num_rows; row++) {
+ ptr = image_data[row] + input_cols; /* first sample past the valid data */
+ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ for (count = numcols; count > 0; count--)
+ *ptr++ = pixval; /* rows must have room for output_cols samples */
+ }
+ }
+}
diff --git a/simd/jdcolext-altivec.c b/simd/jdcolext-altivec.c
new file mode 100644
index 0000000..1ae91b9
--- /dev/null
+++ b/simd/jdcolext-altivec.c
@@ -0,0 +1,274 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{ /* Convert num_rows rows of planar Y/Cb/Cr (input_buf[0..2]) to packed RGB(X) pixels, 16 pixels per pass */
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int pitch = out_width * RGB_PIXELSIZE, num_cols; /* pitch = bytes per output row */
+#if __BIG_ENDIAN__
+ int offset;
+#endif
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; /* bounce buffer for a partial final chunk */
+
+ __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3;
+#endif
+ __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+ __vector int g0, g1, g2, g3;
+
+ /* Constants
+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+ * high-order bits, not 16.
+ */
+ __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+ pw_mf0228 = { __8X(-F_0_228 >> 1) },
+ pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+ pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+ pw_cj = { __8X(CENTERJSAMPLE) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) };
+ __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+ shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row]; /* Y row */
+ inptr1 = input_buf[1][input_row]; /* Cb row */
+ inptr2 = input_buf[2][input_row]; /* Cr row */
+ input_row++;
+ outptr = *output_buf++;
+
+ for (num_cols = pitch; num_cols > 0;
+ num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+ inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+ y = vec_ld(0, inptr0); /* NOTE(review): plain vec_ld() presumes 16-byte aligned, padded input rows -- confirm */
+ /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ yl = (__vector signed short)VEC_UNPACKHU(y);
+ yh = (__vector signed short)VEC_UNPACKLU(y);
+
+ cb = vec_ld(0, inptr1);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
+ cbl = vec_sub(cbl, pw_cj); /* re-center Cb around zero */
+ cbh = vec_sub(cbh, pw_cj);
+
+ cr = vec_ld(0, inptr2);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
+ crl = vec_sub(crl, pw_cj); /* re-center Cr around zero */
+ crh = vec_sub(crh, pw_cj);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+ bl = vec_add(cbl, cbl); /* 2 * Cb */
+ bh = vec_add(cbh, cbh);
+ bl = vec_madds(bl, pw_mf0228, pw_one); /* multiply by the halved constant; the added 1 rounds the >> 1 below */
+ bh = vec_madds(bh, pw_mf0228, pw_one);
+ bl = vec_sra(bl, (__vector unsigned short)pw_one);
+ bh = vec_sra(bh, (__vector unsigned short)pw_one);
+ bl = vec_add(bl, cbl); /* + Cb */
+ bh = vec_add(bh, cbh);
+ bl = vec_add(bl, cbl); /* + Cb */
+ bh = vec_add(bh, cbh);
+ bl = vec_add(bl, yl); /* + Y */
+ bh = vec_add(bh, yh);
+
+ rl = vec_add(crl, crl); /* 2 * Cr */
+ rh = vec_add(crh, crh);
+ rl = vec_madds(rl, pw_f0402, pw_one);
+ rh = vec_madds(rh, pw_f0402, pw_one);
+ rl = vec_sra(rl, (__vector unsigned short)pw_one);
+ rh = vec_sra(rh, (__vector unsigned short)pw_one);
+ rl = vec_add(rl, crl); /* + Cr */
+ rh = vec_add(rh, crh);
+ rl = vec_add(rl, yl); /* + Y */
+ rh = vec_add(rh, yh);
+
+ g0w = vec_mergeh(cbl, crl); /* interleave Cb/Cr to match the (-0.344, 0.285) constant pairs */
+ g1w = vec_mergel(cbl, crl);
+ g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+ g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+ g2w = vec_mergeh(cbh, crh);
+ g3w = vec_mergel(cbh, crh);
+ g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+ g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+ gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+ gl = vec_sub(gl, crl); /* - Cr */
+ gh = vec_sub(gh, crh);
+ gl = vec_add(gl, yl); /* + Y */
+ gh = vec_add(gh, yh);
+
+ rg0 = vec_mergeh(rl, gl);
+ bx0 = vec_mergeh(bl, pw_255); /* filler (X) component is 255 */
+ rg1 = vec_mergel(rl, gl);
+ bx1 = vec_mergel(bl, pw_255);
+ rg2 = vec_mergeh(rh, gh);
+ bx2 = vec_mergeh(bh, pw_255);
+ rg3 = vec_mergel(rh, gh);
+ bx3 = vec_mergel(bh, pw_255);
+
+ rgbx0 = vec_packsu(rg0, bx0); /* narrow to bytes with unsigned saturation */
+ rgbx1 = vec_packsu(rg1, bx1);
+ rgbx2 = vec_packsu(rg2, bx2);
+ rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+ rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+ rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+ rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+ rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+ rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+ offset = (size_t)outptr & 15; /* misalignment of the output pointer within a 16-byte block */
+ if (offset) {
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset; /* span measured from the enclosing 16-byte boundary */
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overwrite. Since there is no way to
+ * write a partial AltiVec register, overwrite would occur on the
+ * last chunk of the last image row if the right edge is not on a
+ * 16-byte boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
+ */
+ vec_st(rgb0, 0, tmpbuf);
+ vec_st(rgb1, 16, tmpbuf);
+ vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ vec_st(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); /* copy only the bytes that belong to this row */
+ } else {
+ /* Fast path */
+ unaligned_shift_index = vec_lvsl(0, outptr);
+ edgel = vec_ld(0, outptr); /* existing contents of the first block touched */
+ edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); /* existing contents of the last block touched */
+ edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+ unaligned_shift_index = vec_lvsr(0, outptr);
+ out0 = vec_perm(edges, rgb0, unaligned_shift_index); /* NOTE(review): edges appears to carry the existing bytes around the span so the aligned stores keep them -- confirm */
+ out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+ out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+ out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+ vec_st(out0, 0, outptr);
+ if (bytes > 16)
+ vec_st(out1, 16, outptr);
+ if (bytes > 32)
+ vec_st(out2, 32, outptr);
+ if (bytes > 48)
+ vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ vec_st(out4, 64, outptr);
+#endif
+ }
+ } else {
+#endif /* __BIG_ENDIAN__ */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { /* shared by aligned big-endian and all little-endian stores */
+ /* Slow path */
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ VEC_ST(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path */
+ VEC_ST(rgb0, 0, outptr);
+ if (num_cols > 16)
+ VEC_ST(rgb1, 16, outptr);
+ if (num_cols > 32)
+ VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ VEC_ST(rgb3, 48, outptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ } /* closes the "else" of if (offset) */
+#endif
+ }
+ }
+}
diff --git a/simd/jdcolor-altivec.c b/simd/jdcolor-altivec.c
new file mode 100644
index 0000000..e0892d8
--- /dev/null
+++ b/simd/jdcolor-altivec.c
@@ -0,0 +1,96 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16 /* fractional bits of the FIX() constants above */
+#define ONE_HALF (1 << (SCALEBITS - 1)) /* 0.5 in the same fixed-point scale */
+
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdcolext-altivec.c" /* default variant; RGB_PIXELSIZE is not defined in this file at this point, so it presumably comes from a header */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_RGB variant; reuses the RGB_INDEX0..2 tables above */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_RGBX variant; single RGB_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_BGR variant */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_BGRX variant */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_XBGR variant */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_XRGB variant */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/simd/jdmerge-altivec.c b/simd/jdmerge-altivec.c
new file mode 100644
index 0000000..cc8d3d9
--- /dev/null
+++ b/simd/jdmerge-altivec.c
@@ -0,0 +1,108 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+/* YCC->RGB coefficients scaled by 2^16: FIX(x) = round(x * 65536) */
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+/* ONE_HALF (2^15) is the rounding term added before the 16-bit descale */
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+/* Default names; RGB_PIXELSIZE presumably comes from an included header. */
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+/* EXT_RGB: reuses the RGB_INDEX0..2 patterns still defined, then drops them. */
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+/* EXT_RGBX: RGB_INDEX is the vec_perm pattern applied to each rgbx vector. */
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+/* EXT_BGR: RGB_INDEXn are the vec_perm patterns for adjacent rgbx pairs. */
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+/* EXT_BGRX: RGB_INDEX is the vec_perm pattern applied to each rgbx vector. */
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+/* EXT_XBGR: RGB_INDEX is the vec_perm pattern applied to each rgbx vector. */
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+/* EXT_XRGB: RGB_INDEX is the vec_perm pattern applied to each rgbx vector. */
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/simd/jdmrgext-altivec.c b/simd/jdmrgext-altivec.c
new file mode 100644
index 0000000..3b6950d
--- /dev/null
+++ b/simd/jdmrgext-altivec.c
@@ -0,0 +1,323 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+/* h2v1 merged upsampling + YCC->RGB: each Cb/Cr sample serves two Y samples. */
+void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+ __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3;
+#endif
+ __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+ rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+ __vector int g_y0, g_y1, g_y2, g_y3;
+
+ /* Constants
+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+ * high-order bits, not 16.
+ */
+ __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+ pw_mf0228 = { __8X(-F_0_228 >> 1) },
+ pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+ pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+ pw_cj = { __8X(CENTERJSAMPLE) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) };
+ __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+ shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
+ even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
+ odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+ even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+ odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
+ /* Component 0 is read as Y, 1 as Cb, 2 as Cr; one output row is written */
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
+ /* Each pass loads 16 Cb and 16 Cr samples, enough for 32 output pixels */
+ cb = vec_ld(0, inptr1);
+ /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
+ cbl = vec_sub(cbl, pw_cj);
+ cbh = vec_sub(cbh, pw_cj);
+
+ cr = vec_ld(0, inptr2);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
+ crl = vec_sub(crl, pw_cj);
+ crh = vec_sub(crh, pw_cj);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+ b_yl = vec_add(cbl, cbl);
+ b_yh = vec_add(cbh, cbh);
+ b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
+ b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+ b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
+ b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+ b_yl = vec_add(b_yl, cbl);
+ b_yh = vec_add(b_yh, cbh);
+ b_yl = vec_add(b_yl, cbl);
+ b_yh = vec_add(b_yh, cbh);
+ /* R - Y = 0.402 * Cr + Cr, computed the same way as the blue term */
+ r_yl = vec_add(crl, crl);
+ r_yh = vec_add(crh, crh);
+ r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+ r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+ r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+ r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+ r_yl = vec_add(r_yl, crl);
+ r_yh = vec_add(r_yh, crh);
+ /* G - Y: interleave (Cb, Cr) pairs and multiply-sum them in 32 bits */
+ g_y0w = vec_mergeh(cbl, crl);
+ g_y1w = vec_mergel(cbl, crl);
+ g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+ g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+ g_y2w = vec_mergeh(cbh, crh);
+ g_y3w = vec_mergel(cbh, crh);
+ g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+ g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+ shift_pack_index);
+ g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+ shift_pack_index);
+ g_yl = vec_sub(g_yl, crl);
+ g_yh = vec_sub(g_yh, crh);
+
+ for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+ num_cols -= RGB_PIXELSIZE * 16,
+ outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
+
+ y = vec_ld(0, inptr0);
+ ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
+ yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
+
+ if (yloop == 0) {
+ be = vec_add(b_yl, ye);
+ bo = vec_add(b_yl, yo);
+ re = vec_add(r_yl, ye);
+ ro = vec_add(r_yl, yo);
+ ge = vec_add(g_yl, ye);
+ go = vec_add(g_yl, yo);
+ } else {
+ be = vec_add(b_yh, ye);
+ bo = vec_add(b_yh, yo);
+ re = vec_add(r_yh, ye);
+ ro = vec_add(r_yh, yo);
+ ge = vec_add(g_yh, ye);
+ go = vec_add(g_yh, yo);
+ }
+ /* Interleave the even/odd results back into pixel order */
+ rl = vec_mergeh(re, ro);
+ rh = vec_mergel(re, ro);
+ gl = vec_mergeh(ge, go);
+ gh = vec_mergel(ge, go);
+ bl = vec_mergeh(be, bo);
+ bh = vec_mergel(be, bo);
+ /* Pair R with G and B with a constant 255 filler element */
+ rg0 = vec_mergeh(rl, gl);
+ bx0 = vec_mergeh(bl, pw_255);
+ rg1 = vec_mergel(rl, gl);
+ bx1 = vec_mergel(bl, pw_255);
+ rg2 = vec_mergeh(rh, gh);
+ bx2 = vec_mergeh(bh, pw_255);
+ rg3 = vec_mergel(rh, gh);
+ bx3 = vec_mergel(bh, pw_255);
+
+ rgbx0 = vec_packsu(rg0, bx0);
+ rgbx1 = vec_packsu(rg1, bx1);
+ rgbx2 = vec_packsu(rg2, bx2);
+ rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+ rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+ rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+ rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+ rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+ rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+ offset = (size_t)outptr & 15;
+ if (offset) {
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset;
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overwrite. Since there is no way to
+ * write a partial AltiVec register, overwrite would occur on the
+ * last chunk of the last image row if the right edge is not on a
+ * 16-byte boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
+ */
+ vec_st(rgb0, 0, tmpbuf);
+ vec_st(rgb1, 16, tmpbuf);
+ vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ vec_st(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path */
+ unaligned_shift_index = vec_lvsl(0, outptr);
+ edgel = vec_ld(0, outptr);
+ edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+ edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+ unaligned_shift_index = vec_lvsr(0, outptr);
+ out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+ out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+ out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+ out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+ vec_st(out0, 0, outptr);
+ if (bytes > 16)
+ vec_st(out1, 16, outptr);
+ if (bytes > 32)
+ vec_st(out2, 32, outptr);
+ if (bytes > 48)
+ vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ vec_st(out4, 64, outptr);
+#endif
+ }
+ } else {
+#endif /* __BIG_ENDIAN__ */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path: stage in tmpbuf, then copy only the valid bytes */
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ VEC_ST(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path */
+ VEC_ST(rgb0, 0, outptr);
+ if (num_cols > 16)
+ VEC_ST(rgb1, 16, outptr);
+ if (num_cols > 32)
+ VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ VEC_ST(rgb3, 48, outptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ }
+#endif
+ }
+ }
+}
+
+/* h2v2: run the h2v1 routine on both Y rows of the pair with the same chroma */
+void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW inptr, outptr;
+ /* Save the two array slots that are temporarily overwritten below */
+ inptr = input_buf[0][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+ jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+ output_buf[0] = output_buf[1];
+ jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+ /* Put the caller's pointers back */
+ input_buf[0][in_row_group_ctr] = inptr;
+ output_buf[0] = outptr;
+}
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
new file mode 100644
index 0000000..6b77d04
--- /dev/null
+++ b/simd/jdsample-altivec.c
@@ -0,0 +1,392 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+/* h2v1 "fancy" upsampling: output = (3 * sample + neighbor + 1 or 2) >> 2. */
+void
+jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow, incol;
+
+ __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+ out;
+ __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+ next0l, next0h, outle, outhe, outlo, outho;
+
+ /* Constants */
+ __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+ last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+ last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
+ next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+ next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
+ merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+ __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* Copy the last sample one slot past the end when width % 16 != 0 */
+ if (downsampled_width & 15)
+ inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+ this0 = vec_ld(0, inptr);
+ p_last0 = vec_perm(this0, this0, last_index_col0);
+ last0 = this0;
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr += 16, outptr += 32) {
+ /* After the first chunk, left neighbors span the previous vector too */
+ if (downsampled_width - incol > 0) {
+ p_last0 = vec_perm(last0, this0, last_index);
+ last0 = this0;
+ }
+
+ if (incol <= 16)
+ p_next0 = vec_perm(this0, this0, next_index_lastcol);
+ else {
+ next0 = vec_ld(16, inptr);
+ p_next0 = vec_perm(this0, next0, next_index);
+ }
+ /* 3 * this0 as 16-bit values (even/odd products re-interleaved) */
+ this0e = (__vector short)vec_mule(this0, pb_three);
+ this0o = (__vector short)vec_mulo(this0, pb_three);
+ this0l = vec_mergeh(this0e, this0o);
+ this0h = vec_mergel(this0e, this0o);
+
+ last0l = (__vector short)VEC_UNPACKHU(p_last0);
+ last0h = (__vector short)VEC_UNPACKLU(p_last0);
+ last0l = vec_add(last0l, pw_one);
+
+ next0l = (__vector short)VEC_UNPACKHU(p_next0);
+ next0h = (__vector short)VEC_UNPACKLU(p_next0);
+ next0l = vec_add(next0l, pw_two);
+
+ outle = vec_add(this0l, last0l);
+ outlo = vec_add(this0l, next0l);
+ outle = vec_sr(outle, (__vector unsigned short)pw_two);
+ outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index);
+ vec_st(out, 0, outptr);
+ /* Second half of the chunk: skipped when 8 or fewer columns remain */
+ if (incol > 8) {
+ last0h = vec_add(last0h, pw_one);
+ next0h = vec_add(next0h, pw_two);
+
+ outhe = vec_add(this0h, last0h);
+ outho = vec_add(this0h, next0h);
+ outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+ outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr);
+ }
+
+ this0 = next0;
+ }
+ }
+}
+
+/* h2v2 "fancy" upsampling: 3:1 blend of rows, then 3:1 blend of columns, >> 4 */
+void
+jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol;
+
+ __vector unsigned char this_1, this0, this1, out;
+ __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+ lastcolsum_1h, lastcolsum1h,
+ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+ nextcolsum_1l = {0}, nextcolsum_1h = {0},
+ nextcolsum1l = {0}, nextcolsum1h = {0},
+ p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+ tmpl, tmph, outle, outhe, outlo, outho;
+
+ /* Constants */
+ __vector unsigned char pb_zero = { __16X(0) },
+ last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
+ last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
+ next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
+ next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
+ merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+ __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+ pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+ __vector unsigned short pw_four = { __8X(4) };
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ /* "_1" names refer to row inrow - 1, "1" names to row inrow + 1 */
+ inptr_1 = input_data[inrow - 1];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ if (downsampled_width & 15) {
+ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+ }
+ /* Row blend: colsum = 3 * current row + neighboring row (16-bit) */
+ this0 = vec_ld(0, inptr0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
+ this0l = vec_mladd(this0l, pw_three, pw_zero);
+ this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+ this_1 = vec_ld(0, inptr_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
+ thiscolsum_1l = vec_add(this0l, this_1l);
+ thiscolsum_1h = vec_add(this0h, this_1h);
+ lastcolsum_1h = thiscolsum_1h;
+ p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+ p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+ this1 = vec_ld(0, inptr1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
+ thiscolsum1l = vec_add(this0l, this1l);
+ thiscolsum1h = vec_add(this0h, this1h);
+ lastcolsum1h = thiscolsum1h;
+ p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+ p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+ outptr0 += 32, outptr1 += 32) {
+
+ if (downsampled_width - incol > 0) {
+ p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+ p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+ p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+ p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+ lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
+ }
+ /* Right-hand neighbors: the last chunk repeats its own final element */
+ if (incol <= 16) {
+ p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+ p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+ next_index_lastcol);
+ p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+ p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+ next_index_lastcol);
+ } else {
+ this0 = vec_ld(16, inptr0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
+ this0l = vec_mladd(this0l, pw_three, pw_zero);
+ this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+ this_1 = vec_ld(16, inptr_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
+ nextcolsum_1l = vec_add(this0l, this_1l);
+ nextcolsum_1h = vec_add(this0h, this_1h);
+ p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+ p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+ this1 = vec_ld(16, inptr1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
+ nextcolsum1l = vec_add(this0l, this1l);
+ nextcolsum1h = vec_add(this0h, this1h);
+ p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+ p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+ }
+
+ /* Process the upper row */
+
+ tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+ outle = vec_add(tmpl, p_lastcolsum_1l);
+ outle = vec_add(outle, pw_eight);
+ outle = vec_sr(outle, pw_four);
+
+ outlo = vec_add(tmpl, p_nextcolsum_1l);
+ outlo = vec_add(outlo, pw_seven);
+ outlo = vec_sr(outlo, pw_four);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index);
+ vec_st(out, 0, outptr0);
+
+ if (incol > 8) {
+ tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum_1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum_1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr0);
+ }
+
+ /* Process the lower row */
+
+ tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+ outle = vec_add(tmpl, p_lastcolsum1l);
+ outle = vec_add(outle, pw_eight);
+ outle = vec_sr(outle, pw_four);
+
+ outlo = vec_add(tmpl, p_nextcolsum1l);
+ outlo = vec_add(outlo, pw_seven);
+ outlo = vec_sr(outlo, pw_four);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index);
+ vec_st(out, 0, outptr1);
+
+ if (incol > 8) {
+ tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr1);
+ }
+
+ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
+ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
+ }
+ }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+/* Plain h2v1 upsampling: every input sample is written twice in a row. */
+void
+jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow, incol;
+
+ __vector unsigned char in, inl, inh;
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* The column count is output_width rounded up to a multiple of 32 */
+ for (incol = (output_width + 31) & (~31); incol > 0;
+ incol -= 64, inptr += 32, outptr += 64) {
+
+ in = vec_ld(0, inptr);
+ inl = vec_mergeh(in, in);
+ inh = vec_mergel(in, in);
+
+ vec_st(inl, 0, outptr);
+ vec_st(inh, 16, outptr);
+ /* Second 16 input samples, only if more than 32 outputs remain */
+ if (incol > 32) {
+ in = vec_ld(16, inptr);
+ inl = vec_mergeh(in, in);
+ inh = vec_mergel(in, in);
+
+ vec_st(inl, 32, outptr);
+ vec_st(inh, 48, outptr);
+ }
+ }
+ }
+}
+
+/* Plain h2v2 upsampling: duplicate each sample and emit two identical rows. */
+void
+jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow, incol;
+
+ __vector unsigned char in, inl, inh;
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ /* The column count is output_width rounded up to a multiple of 32 */
+ for (incol = (output_width + 31) & (~31); incol > 0;
+ incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
+
+ in = vec_ld(0, inptr);
+ inl = vec_mergeh(in, in);
+ inh = vec_mergel(in, in);
+
+ vec_st(inl, 0, outptr0);
+ vec_st(inl, 0, outptr1);
+
+ vec_st(inh, 16, outptr0);
+ vec_st(inh, 16, outptr1);
+ /* Second 16 input samples, only if more than 32 outputs remain */
+ if (incol > 32) {
+ in = vec_ld(16, inptr);
+ inl = vec_mergeh(in, in);
+ inh = vec_mergel(in, in);
+
+ vec_st(inl, 32, outptr0);
+ vec_st(inl, 32, outptr1);
+
+ vec_st(inh, 48, outptr0);
+ vec_st(inh, 48, outptr1);
+ }
+ }
+ }
+}
diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
new file mode 100644
index 0000000..c4cc26e
--- /dev/null
+++ b/simd/jfdctfst-altivec.c
@@ -0,0 +1,156 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+/* Multipliers scaled by 2^CONST_BITS: FIX(x) = round(x * 256) */
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
+/* CONST_SHIFT works out to 5; the constants are pre-shifted left by it */
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+/* One 1-D pass: reads tmp0..tmp7, writes out0..out7; z* and tmp1x are scratch */
+#define DO_FDCT() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out4 = vec_sub(tmp10, tmp11); \
+ \
+ z1 = vec_add(tmp12, tmp13); \
+ z1 = vec_sl(z1, pre_multiply_scale_bits); \
+ z1 = vec_madds(z1, pw_0707, pw_zero); \
+ \
+ out2 = vec_add(tmp13, z1); \
+ out6 = vec_sub(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = vec_add(tmp4, tmp5); \
+ tmp11 = vec_add(tmp5, tmp6); \
+ tmp12 = vec_add(tmp6, tmp7); \
+ \
+ tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ z5 = vec_sub(tmp10, tmp12); \
+ z5 = vec_madds(z5, pw_0382, pw_zero); \
+ \
+ z2 = vec_madds(tmp10, pw_0541, z5); \
+ z4 = vec_madds(tmp12, pw_1306, z5); \
+ \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+ \
+ z11 = vec_add(tmp7, z3); \
+ z13 = vec_sub(tmp7, z3); \
+ \
+ out5 = vec_add(z13, z2); \
+ out3 = vec_sub(z13, z2); \
+ out1 = vec_add(z11, z4); \
+ out7 = vec_sub(z11, z4); \
+}
+
+/* Fast integer forward DCT, in place, on the eight 16-byte vectors at data. */
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z1, z2, z3, z4, z5, z11, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __vector short pw_zero = { __8X(0) },
+ pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+ pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+ pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+ pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+ /* Pass 1: process rows */
+
+ row0 = vec_ld(0, data);
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col);
+ /* Sums and differences of mirrored vectors (0/7, 1/6, 2/5, 3/4) */
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT();
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT();
+ /* The results overwrite the input block */
+ vec_st(out0, 0, data);
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
new file mode 100644
index 0000000..c13850a
--- /dev/null
+++ b/simd/jfdctint-altivec.c
@@ -0,0 +1,262 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+/* Each value above is the quoted constant times 1 << CONST_BITS, rounded (e.g. 0.541196100 * 8192 ~= 4433). */
+#define CONST_BITS 13 /* fractional bits carried by the F_* constants */
+#define PASS1_BITS 2 /* extra scaling kept in the pass 1 outputs and removed in pass 2 */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) /* right-shift count used by DO_FDCT_COMMON(1) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS) /* right-shift count used by DO_FDCT_COMMON(2) */
+
+
+#define DO_FDCT_COMMON(PASS) \
+{ /* Tail shared by both passes: fills every output except out0/out4; PASS selects pd_descale_p<PASS> and descale_p<PASS> */ \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * data2 = z1 + tmp13 * 0.765366865; \
+ * data6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ /* Interleave tmp13/tmp12 so that each 32-bit vec_msums lane receives one (tmp13, tmp12) pair */ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ /* Multiply each pair by a constant pair, sum, and add the rounding term in one step */ \
+ out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+ out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+ out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+ out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+ /* Shift out the fixed-point fraction, then narrow the two halves back into one 16-bit vector */ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ /* The rounding term is folded into z3l/z3h/z4l/z4h here; the odd outputs below accumulate onto them and so inherit it */ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \
+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ /* z3/z4 are passed as the accumulator operand, so each call yields a finished data7/data1 sum */ \
+ out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+ out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+ out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+ out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+ \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ /* Same scheme for data5/data3, using the (tmp5, tmp6) pairs */ \
+ out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+ out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+ out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+ out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+ \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() \
+{ /* First pass: computes out0/out4 here, then hands off to DO_FDCT_COMMON(1) for the other six outputs */ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ /* out0/out4 involve no multiply; they are shifted left by pass1_bits instead of being descaled */ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() \
+{ /* Second pass: computes out0/out4 here, then hands off to DO_FDCT_COMMON(2) for the other six outputs */ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ /* out0/out4: add the rounding term pw_descale_p2x, then shift right by pass1_bits */ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, pw_descale_p2x); \
+ out0 = vec_sra(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, pw_descale_p2x); \
+ out4 = vec_sra(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{ /* Works in place: data is read as eight vectors, run through DO_FDCT_PASS1() and DO_FDCT_PASS2(), and overwritten. */
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants: pw_* are multiplier pairs for vec_msums, pd_descale_p* / pw_descale_p2x are rounding terms, the rest are shift counts */
+ __vector short
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+ pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) };
+
+ /* Pass 1: process rows.  Eight vectors are loaded from byte offsets
+ * 0, 16, ..., 112 of data and then transposed into col0..col7. */
+ row0 = vec_ld(0, data);
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col);
+ /* Sum and difference of each mirrored pair (0/7, 1/6, 2/5, 3/4) */
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_PASS1();
+
+ /* Pass 2: process columns.  The pass 1 outputs are transposed into
+ * row0..row7, and the same butterflies feed DO_FDCT_PASS2(). */
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_PASS2();
+ /* Overwrite the input with the result, at the same offsets the loads used */
+ vec_st(out0, 0, data);
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
new file mode 100644
index 0000000..fd7a2a3
--- /dev/null
+++ b/simd/jidctfst-altivec.c
@@ -0,0 +1,257 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+/* Each value above is the quoted constant times 1 << CONST_BITS, rounded (e.g. 1.414213562 * 256 ~= 362). */
+#define CONST_BITS 8 /* fractional bits carried by the F_* constants */
+#define PASS1_BITS 2 /* combined with 3 more bits for the final right shift in the IDCT function */
+#define PRE_MULTIPLY_SCALE_BITS 2 /* left shift applied to a multiplicand before vec_madds */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) /* left shift applied to the F_* constants; evaluates to 5 */
+
+
+#define DO_IDCT(in) \
+{ /* One 1-D pass over in<0..7>, written to out0..out7; relies on the expanding function's tmp*, z* and pw_* locals */ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(in##0, in##4); \
+ tmp11 = vec_sub(in##0, in##4); \
+ tmp13 = vec_add(in##2, in##6); \
+ /* tmp12 = (in2 - in6) * 1.414213562 - tmp13; the multiplicand is pre-shifted before vec_madds */ \
+ tmp12 = vec_sub(in##2, in##6); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+ tmp12 = vec_sub(tmp12, tmp13); \
+ \
+ tmp0 = vec_add(tmp10, tmp13); \
+ tmp3 = vec_sub(tmp10, tmp13); \
+ tmp1 = vec_add(tmp11, tmp12); \
+ tmp2 = vec_sub(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ /* z10s/z12s are pre-shifted multiplicands; the unshifted z10 is kept for the "- z10" term below */ \
+ z13 = vec_add(in##5, in##3); \
+ z10 = vec_sub(in##5, in##3); \
+ z10s = vec_sl(z10, pre_multiply_scale_bits); \
+ z11 = vec_add(in##1, in##7); \
+ z12s = vec_sub(in##1, in##7); \
+ z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+ \
+ tmp11 = vec_sub(z11, z13); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+ \
+ tmp7 = vec_add(z11, z13); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+ * = -1.613125930 * z10 - z10 + z5; \
+ */ \
+ \
+ z5 = vec_add(z10s, z12s); \
+ z5 = vec_madds(z5, pw_F1847, pw_zero); \
+ /* z5 is the addend of the second vec_madds, so "+ z5" costs no separate add */ \
+ tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+ tmp10 = vec_sub(tmp10, z5); \
+ tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+ tmp12 = vec_sub(tmp12, z10); \
+ \
+ tmp6 = vec_sub(tmp12, tmp7); \
+ tmp5 = vec_sub(tmp11, tmp6); \
+ tmp4 = vec_add(tmp10, tmp5); \
+ /* Combine the even results (tmp0..tmp3) with the odd results (tmp4..tmp7) */ \
+ out0 = vec_add(tmp0, tmp7); \
+ out1 = vec_add(tmp1, tmp6); \
+ out2 = vec_add(tmp2, tmp5); \
+ out3 = vec_sub(tmp3, tmp4); \
+ out4 = vec_add(tmp3, tmp4); \
+ out5 = vec_sub(tmp2, tmp5); \
+ out6 = vec_sub(tmp1, tmp6); \
+ out7 = vec_sub(tmp0, tmp7); \
+}
+
+
+void
+jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block, /* in: multiplier table (read as shorts) and coefficient block, eight vectors each */
+ JSAMPARRAY output_buf, JDIMENSION output_col) /* out: 8 bytes are written at output_buf[n] + output_col for n = 0..7 */
+{ /* Dequantizes coef_block, runs two DO_IDCT() passes, and stores the level-shifted 8-bit result. */
+ short *dct_table = (short *)dct_table_;
+ int *outptr;
+
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z5, z10, z10s, z11, z12s, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector signed char outb;
+
+ /* Constants.  pw_MF1613 negates AFTER shifting, because left-shifting a negative value is undefined behavior in C. */
+ __vector short pw_zero = { __8X(0) },
+ pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+ pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+ pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },
+ pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+ pass1_bits3 = { __8X(PASS1_BITS + 3) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns.  Eight vectors are loaded from byte offsets
+ * 0, 16, ..., 112 of coef_block. */
+ col0 = vec_ld(0, coef_block);
+ col1 = vec_ld(16, coef_block);
+ col2 = vec_ld(32, coef_block);
+ col3 = vec_ld(48, coef_block);
+ col4 = vec_ld(64, coef_block);
+ col5 = vec_ld(80, coef_block);
+ col6 = vec_ld(96, coef_block);
+ col7 = vec_ld(112, coef_block);
+ /* OR together every vector except the first; the result is all-zero only if each of them is */
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+ /* The first vector is dequantized unconditionally; both branches below need it */
+ quant0 = vec_ld(0, dct_table);
+ col0 = vec_mladd(col0, quant0, pw_zero);
+
+ if (vec_all_eq(tmp1, pw_zero)) {
+ /* AC terms all zero: skip DO_IDCT() and TRANSPOSE() for this pass and
+ * broadcast lane n of the dequantized first vector across row<n>. */
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+ /* General case: dequantize the remaining seven vectors, then run the column pass */
+ quant1 = vec_ld(16, dct_table);
+ quant2 = vec_ld(32, dct_table);
+ quant3 = vec_ld(48, dct_table);
+ quant4 = vec_ld(64, dct_table);
+ quant5 = vec_ld(80, dct_table);
+ quant6 = vec_ld(96, dct_table);
+ quant7 = vec_ld(112, dct_table);
+
+ col1 = vec_mladd(col1, quant1, pw_zero);
+ col2 = vec_mladd(col2, quant2, pw_zero);
+ col3 = vec_mladd(col3, quant3, pw_zero);
+ col4 = vec_mladd(col4, quant4, pw_zero);
+ col5 = vec_mladd(col5, quant5, pw_zero);
+ col6 = vec_mladd(col6, quant6, pw_zero);
+ col7 = vec_mladd(col7, quant7, pw_zero);
+
+ DO_IDCT(col);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(row);
+ /* Final descale: arithmetic shift right by PASS1_BITS + 3 */
+ out0 = vec_sra(out0, pass1_bits3);
+ out1 = vec_sra(out1, pass1_bits3);
+ out2 = vec_sra(out2, pass1_bits3);
+ out3 = vec_sra(out3, pass1_bits3);
+ out4 = vec_sra(out4, pass1_bits3);
+ out5 = vec_sra(out5, pass1_bits3);
+ out6 = vec_sra(out6, pass1_bits3);
+ out7 = vec_sra(out7, pass1_bits3);
+
+ TRANSPOSE(out, col);
+ /* Output: the same five-step stanza is repeated for col0..col7 / output_buf[0..7] */
+ outb = vec_packs(col0, col0); /* saturating pack to bytes; both halves of outb hold the same 8 samples */
+ outb = vec_add(outb, pb_centerjsamp); /* level shift by CENTERJSAMPLE */
+ outptr = (int *)(output_buf[0] + output_col);
+ vec_ste((__vector int)outb, 0, outptr); /* first 4 bytes */
+ vec_ste((__vector int)outb, 4, outptr); /* next 4 bytes */
+
+ outb = vec_packs(col1, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[1] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col2, col2);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[2] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col3, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[3] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col4, col4);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[4] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col5, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[5] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col6, col6);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[6] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col7, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[7] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
new file mode 100644
index 0000000..7f0f8d0
--- /dev/null
+++ b/simd/jidctint-altivec.c
@@ -0,0 +1,359 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+/* Each value above is the quoted constant times 1 << CONST_BITS, rounded (e.g. 0.541196100 * 8192 ~= 4433). */
+#define CONST_BITS 13 /* fractional bits carried by the F_* constants */
+#define PASS1_BITS 2 /* extra scaling kept in the pass 1 outputs and removed in pass 2 */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) /* right-shift count used by DO_IDCT(in, 1) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) /* right-shift count used by DO_IDCT(in, 2) */
+
+
+#define DO_IDCT(in, PASS) \
+{ /* One 1-D pass over in<0..7>, written to out0..out7; PASS selects pd_descale_p<PASS> and descale_p<PASS> */ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ /* Interleave inputs 2 and 6 so that each 32-bit vec_msums lane receives one pair */ \
+ in##26l = vec_mergeh(in##2, in##6); \
+ in##26h = vec_mergel(in##2, in##6); \
+ \
+ tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+ tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+ tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+ tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+ \
+ tmp0 = vec_add(in##0, in##4); \
+ tmp1 = vec_sub(in##0, in##4); \
+ /* Widen tmp0 to 32 bits, scale by 1 << CONST_BITS, and pre-add this pass's rounding term */ \
+ tmp0l = vec_unpackh(tmp0); \
+ tmp0h = vec_unpackl(tmp0); \
+ tmp0l = vec_sl(tmp0l, const_bits); \
+ tmp0h = vec_sl(tmp0h, const_bits); \
+ tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+ tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+ \
+ tmp10l = vec_add(tmp0l, tmp3l); \
+ tmp10h = vec_add(tmp0h, tmp3h); \
+ tmp13l = vec_sub(tmp0l, tmp3l); \
+ tmp13h = vec_sub(tmp0h, tmp3h); \
+ /* Same widening, scaling and rounding for tmp1 */ \
+ tmp1l = vec_unpackh(tmp1); \
+ tmp1h = vec_unpackl(tmp1); \
+ tmp1l = vec_sl(tmp1l, const_bits); \
+ tmp1h = vec_sl(tmp1h, const_bits); \
+ tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+ tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+ \
+ tmp11l = vec_add(tmp1l, tmp2l); \
+ tmp11h = vec_add(tmp1h, tmp2h); \
+ tmp12l = vec_sub(tmp1l, tmp2l); \
+ tmp12h = vec_sub(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(in##3, in##7); \
+ z4 = vec_add(in##1, in##5); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ /* No rounding term here (pd_zero); it was already added on the even side above */ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ /* The odd-part tmp0/tmp3 are built directly from the (input 7, input 1) pairs; z3/z4 ride in as the accumulator */ \
+ in##71l = vec_mergeh(in##7, in##1); \
+ in##71h = vec_mergel(in##7, in##1); \
+ \
+ tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+ tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+ tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+ tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+ /* Likewise tmp1/tmp2 from the (input 5, input 3) pairs */ \
+ in##53l = vec_mergeh(in##5, in##3); \
+ in##53h = vec_mergel(in##5, in##3); \
+ \
+ tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+ tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+ tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+ tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+ \
+ /* Final output stage */ \
+ /* For each output pair: even +/- odd, shift right by descale_p<PASS>, then vec_pack back to 16-bit lanes */ \
+ out0l = vec_add(tmp10l, tmp3l); \
+ out0h = vec_add(tmp10h, tmp3h); \
+ out7l = vec_sub(tmp10l, tmp3l); \
+ out7h = vec_sub(tmp10h, tmp3h); \
+ \
+ out0l = vec_sra(out0l, descale_p##PASS); \
+ out0h = vec_sra(out0h, descale_p##PASS); \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ \
+ out0 = vec_pack(out0l, out0h); \
+ out7 = vec_pack(out7l, out7h); \
+ \
+ out1l = vec_add(tmp11l, tmp2l); \
+ out1h = vec_add(tmp11h, tmp2h); \
+ out6l = vec_sub(tmp11l, tmp2l); \
+ out6h = vec_sub(tmp11h, tmp2h); \
+ \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out1 = vec_pack(out1l, out1h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ out2l = vec_add(tmp12l, tmp1l); \
+ out2h = vec_add(tmp12h, tmp1h); \
+ out5l = vec_sub(tmp12l, tmp1l); \
+ out5h = vec_sub(tmp12h, tmp1h); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out5 = vec_pack(out5l, out5h); \
+ \
+ out3l = vec_add(tmp13l, tmp0l); \
+ out3h = vec_add(tmp13h, tmp0h); \
+ out4l = vec_sub(tmp13l, tmp0l); \
+ out4h = vec_sub(tmp13h, tmp0h); \
+ \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ out4l = vec_sra(out4l, descale_p##PASS); \
+ out4h = vec_sra(out4h, descale_p##PASS); \
+ \
+ out3 = vec_pack(out3l, out3h); \
+ out4 = vec_pack(out4l, out4h); \
+}
+
+
+void
+jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block, /* in: multiplier table (read as shorts) and coefficient block, eight vectors each */
+ JSAMPARRAY output_buf, JDIMENSION output_col) /* out: 8 bytes are written at output_buf[n] + output_col for n = 0..7 */
+{ /* Dequantizes coef_block, runs DO_IDCT(col, 1) and DO_IDCT(row, 2), and stores the level-shifted 8-bit result. */
+ short *dct_table = (short *)dct_table_;
+ int *outptr;
+
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, z3, z4,
+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+ row71l, row71h, row26l, row26h, row53l, row53h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+ z3l, z3h, z4l, z4h,
+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+ out5l, out5h, out6l, out6h, out7l, out7h;
+ __vector signed char outb;
+
+ /* Constants: pw_* are multiplier pairs for vec_msums, pd_descale_p* are rounding terms, the rest are shift counts or zero */
+ __vector short pw_zero = { __8X(0) },
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int pd_zero = { __4X(0) },
+ pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) },
+ const_bits = { __4X(CONST_BITS) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns.  Eight vectors are loaded from byte offsets
+ * 0, 16, ..., 112 of coef_block. */
+ col0 = vec_ld(0, coef_block);
+ col1 = vec_ld(16, coef_block);
+ col2 = vec_ld(32, coef_block);
+ col3 = vec_ld(48, coef_block);
+ col4 = vec_ld(64, coef_block);
+ col5 = vec_ld(80, coef_block);
+ col6 = vec_ld(96, coef_block);
+ col7 = vec_ld(112, coef_block);
+ /* OR together every vector except the first; the result is all-zero only if each of them is */
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+ /* The first vector is dequantized unconditionally; both branches below need it */
+ quant0 = vec_ld(0, dct_table);
+ col0 = vec_mladd(col0, quant0, pw_zero);
+
+ if (vec_all_eq(tmp1, pw_zero)) {
+ /* AC terms all zero: skip DO_IDCT() and TRANSPOSE() for this pass.  The */
+ /* dequantized first vector is scaled by PASS1_BITS, then lane n is broadcast across row<n>. */
+ col0 = vec_sl(col0, pass1_bits);
+
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+ /* General case: dequantize the remaining seven vectors, then run the column pass */
+ quant1 = vec_ld(16, dct_table);
+ quant2 = vec_ld(32, dct_table);
+ quant3 = vec_ld(48, dct_table);
+ quant4 = vec_ld(64, dct_table);
+ quant5 = vec_ld(80, dct_table);
+ quant6 = vec_ld(96, dct_table);
+ quant7 = vec_ld(112, dct_table);
+
+ col1 = vec_mladd(col1, quant1, pw_zero);
+ col2 = vec_mladd(col2, quant2, pw_zero);
+ col3 = vec_mladd(col3, quant3, pw_zero);
+ col4 = vec_mladd(col4, quant4, pw_zero);
+ col5 = vec_mladd(col5, quant5, pw_zero);
+ col6 = vec_mladd(col6, quant6, pw_zero);
+ col7 = vec_mladd(col7, quant7, pw_zero);
+
+ DO_IDCT(col, 1);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows (the final descale is part of DO_IDCT(row, 2) via descale_p2) */
+
+ DO_IDCT(row, 2);
+
+ TRANSPOSE(out, col);
+ /* Output: the same five-step stanza is repeated for col0..col7 / output_buf[0..7] */
+ outb = vec_packs(col0, col0); /* saturating pack to bytes; both halves of outb hold the same 8 samples */
+ outb = vec_add(outb, pb_centerjsamp); /* level shift by CENTERJSAMPLE */
+ outptr = (int *)(output_buf[0] + output_col);
+ vec_ste((__vector int)outb, 0, outptr); /* first 4 bytes */
+ vec_ste((__vector int)outb, 4, outptr); /* next 4 bytes */
+
+ outb = vec_packs(col1, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[1] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col2, col2);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[2] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col3, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[3] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col4, col4);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[4] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col5, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[5] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col6, col6);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[6] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col7, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[7] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
new file mode 100644
index 0000000..2fbec2d
--- /dev/null
+++ b/simd/jquanti-altivec.c
@@ -0,0 +1,252 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* LOAD_ROW(n) points elemptr at column start_col of sample row n and reads
+ * one vector from that address into in##n.  The expansion relies on the
+ * caller having sample_data, start_col, elemptr, and in##n in scope.
+ *
+ * NOTE: The address will either be aligned or offset by 8 bytes, so a single
+ * vector load always fetches the data we want.  In the big endian variant,
+ * the loaded vector is additionally permuted with the vec_lvsl() pattern
+ * whenever the address is not a multiple of 16.  The little endian variant
+ * loads with vec_vsx_ld() and applies no permutation.
+ */
+#if !__BIG_ENDIAN__
+
+#define LOAD_ROW(n) { \
+  elemptr = sample_data[n] + start_col; \
+  in##n = vec_vsx_ld(0, elemptr); \
+}
+
+#else
+
+#define LOAD_ROW(n) { \
+  elemptr = sample_data[n] + start_col; \
+  in##n = vec_ld(0, elemptr); \
+  if (((size_t)elemptr & 15) != 0) \
+    in##n = vec_perm(in##n, in##n, vec_lvsl(0, elemptr)); \
+}
+
+#endif
+
+
+/* Sample conversion for one 8x8 block.
+ *
+ * For each of the eight rows in sample_data, the samples starting at column
+ * start_col are loaded, interleaved with zero bytes (VEC_UNPACKHU) to form
+ * 16-bit values, reduced by CENTERJSAMPLE, and stored to workspace at
+ * successive 16-byte offsets.
+ */
+void
+jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM * workspace)
+{
+  JSAMPROW elemptr;             /* scratch pointer used by LOAD_ROW() */
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_center = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };  /* used by VEC_UNPACKHU() */
+
+  /* Fetch all eight rows before anything is written to the workspace */
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  /* Widen each row to 16 bits per sample and remove the center offset */
+  out0 = vec_sub((__vector short)VEC_UNPACKHU(in0), pw_center);
+  out1 = vec_sub((__vector short)VEC_UNPACKHU(in1), pw_center);
+  out2 = vec_sub((__vector short)VEC_UNPACKHU(in2), pw_center);
+  out3 = vec_sub((__vector short)VEC_UNPACKHU(in3), pw_center);
+  out4 = vec_sub((__vector short)VEC_UNPACKHU(in4), pw_center);
+  out5 = vec_sub((__vector short)VEC_UNPACKHU(in5), pw_center);
+  out6 = vec_sub((__vector short)VEC_UNPACKHU(in6), pw_center);
+  out7 = vec_sub((__vector short)VEC_UNPACKHU(in7), pw_center);
+
+  /* One 16-byte vector per row */
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT 16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+ * We basically need an unsigned equivalent of vec_madds().
+ *
+ * MULTIPLY(a, b, dst) reinterprets a and b as vectors of unsigned 16-bit
+ * elements, forms the even-element and odd-element products with vec_mule()
+ * and vec_mulo(), and then uses shift_pack_index to select 16 bits from each
+ * product into dst.  The caller must have tmpe, tmpo, and shift_pack_index in
+ * scope.  Note that a and b are each expanded twice.
+ */
+
+#define MULTIPLY(a, b, dst) { \
+  tmpe = vec_mule((__vector unsigned short)a, (__vector unsigned short)b); \
+  tmpo = vec_mulo((__vector unsigned short)a, (__vector unsigned short)b); \
+  dst = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
+}
+
+/* Integer quantization of one 8x8 block.
+ *
+ * Eight vectors are read from workspace.  For every element, the absolute
+ * value is taken, a correction term is added, the sum is passed through
+ * MULTIPLY() with a reciprocal and then with a scale factor, the original
+ * sign is restored, and the result is stored to coef_block.  The reciprocals,
+ * corrections, and scale factors are read from divisors at byte offsets 0,
+ * DCTSIZE2 * 2, and DCTSIZE2 * 4, respectively.
+ */
+void
+jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
+                        DCTELEM * workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    sign0, sign1, sign2, sign3, sign4, sign5, sign6, sign7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;     /* scratch used by MULTIPLY() */
+
+  /* Constants */
+  __vector unsigned short pw_sign_shift = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+  __vector unsigned char shift_pack_index =
+    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
+
+  /* Fetch the block */
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value:  an arithmetic shift by (WORD_BIT - 1)
+   * produces a per-element sign mask s, and |x| = (x ^ s) - s.
+   */
+  sign0 = vec_sra(row0, pw_sign_shift);
+  sign1 = vec_sra(row1, pw_sign_shift);
+  sign2 = vec_sra(row2, pw_sign_shift);
+  sign3 = vec_sra(row3, pw_sign_shift);
+  sign4 = vec_sra(row4, pw_sign_shift);
+  sign5 = vec_sra(row5, pw_sign_shift);
+  sign6 = vec_sra(row6, pw_sign_shift);
+  sign7 = vec_sra(row7, pw_sign_shift);
+  row0 = vec_sub(vec_xor(row0, sign0), sign0);
+  row1 = vec_sub(vec_xor(row1, sign1), sign1);
+  row2 = vec_sub(vec_xor(row2, sign2), sign2);
+  row3 = vec_sub(vec_xor(row3, sign3), sign3);
+  row4 = vec_sub(vec_xor(row4, sign4), sign4);
+  row5 = vec_sub(vec_xor(row5, sign5), sign5);
+  row6 = vec_sub(vec_xor(row6, sign6), sign6);
+  row7 = vec_sub(vec_xor(row7, sign7), sign7);
+
+  /* Add the corrections */
+  row0 = vec_add(row0, vec_ld(DCTSIZE2 * 2, divisors));
+  row1 = vec_add(row1, vec_ld(DCTSIZE2 * 2 + 16, divisors));
+  row2 = vec_add(row2, vec_ld(DCTSIZE2 * 2 + 32, divisors));
+  row3 = vec_add(row3, vec_ld(DCTSIZE2 * 2 + 48, divisors));
+  row4 = vec_add(row4, vec_ld(DCTSIZE2 * 2 + 64, divisors));
+  row5 = vec_add(row5, vec_ld(DCTSIZE2 * 2 + 80, divisors));
+  row6 = vec_add(row6, vec_ld(DCTSIZE2 * 2 + 96, divisors));
+  row7 = vec_add(row7, vec_ld(DCTSIZE2 * 2 + 112, divisors));
+
+  /* Multiply by the reciprocals */
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  /* Multiply by the scale factors */
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  /* Put the signs back, using the same (x ^ s) - s identity */
+  row0 = vec_sub(vec_xor(row0, sign0), sign0);
+  row1 = vec_sub(vec_xor(row1, sign1), sign1);
+  row2 = vec_sub(vec_xor(row2, sign2), sign2);
+  row3 = vec_sub(vec_xor(row3, sign3), sign3);
+  row4 = vec_sub(vec_xor(row4, sign4), sign4);
+  row5 = vec_sub(vec_xor(row5, sign5), sign5);
+  row6 = vec_sub(vec_xor(row6, sign6), sign6);
+  row7 = vec_sub(vec_xor(row7, sign7), sign7);
+
+  /* Write out the quantized coefficients */
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index c5abd45..c0de7e7 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,7 +2,7 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2011 D. R. Commander
+ * Copyright (C) 2011, 2014-2015 D. R. Commander
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California
* Copyright (C) 2014 Linaro Limited
*
@@ -21,6 +21,7 @@
#define JSIMD_SSE2 0x08
#define JSIMD_ARM_NEON 0x10
#define JSIMD_MIPS_DSPR2 0x20
+#define JSIMD_ALTIVEC 0x40
/* SIMD Ext: retrieve SIMD/CPU information */
EXTERN(unsigned int) jpeg_simd_cpu_support (void);
@@ -115,6 +116,28 @@
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
EXTERN(void) jsimd_rgb_gray_convert_mmx
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -183,6 +206,28 @@
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
/* YCC --> RGB & extended RGB Colorspace Conversion */
EXTERN(void) jsimd_ycc_rgb_convert_mmx
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -276,6 +321,28 @@
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
/* NULL Colorspace Conversion */
EXTERN(void) jsimd_c_null_convert_mips_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -297,6 +364,11 @@
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample_altivec
+ (JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data);
+
/* h2v2 Downsampling */
EXTERN(void) jsimd_h2v2_downsample_mmx
(JDIMENSION image_width, int max_v_samp_factor,
@@ -313,6 +385,11 @@
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_altivec
+ (JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data);
+
/* h2v2 Smooth Downsampling */
EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
(JSAMPARRAY input_data, JSAMPARRAY output_data,
@@ -348,6 +425,12 @@
JSAMPARRAY * output_data_ptr, JDIMENSION output_width,
int max_v_samp_factor);
+EXTERN(void) jsimd_h2v1_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr);
/* Fancy Upsampling */
EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
@@ -376,6 +459,13 @@
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+
/* Merged Upsampling */
EXTERN(void) jsimd_h2v1_merged_upsample_mmx
(JDIMENSION output_width, JSAMPIMAGE input_buf,
@@ -510,6 +600,50 @@
(JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
/* Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
@@ -523,6 +657,9 @@
EXTERN(void) jsimd_convsamp_mips_dspr2
(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+EXTERN(void) jsimd_convsamp_altivec
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+
/* Floating Point Sample Conversion */
EXTERN(void) jsimd_convsamp_float_3dnow
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
@@ -544,6 +681,8 @@
EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
+
/* Fast Integer Forward DCT */
EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
@@ -554,6 +693,8 @@
EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM * data);
+
/* Floating Point Forward DCT */
EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT * data);
@@ -573,6 +714,9 @@
EXTERN(void) jsimd_quantize_mips_dspr2
(JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+EXTERN(void) jsimd_quantize_altivec
+ (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+
/* Floating Point Quantization */
EXTERN(void) jsimd_quantize_float_3dnow
(JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
@@ -641,6 +785,10 @@
(void * dct_table, JCOEFPTR coef_block, int * output_buf,
JSAMPLE * output_col);
+EXTERN(void) jsimd_idct_islow_altivec
+ (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
/* Fast Integer Inverse DCT */
EXTERN(void) jsimd_idct_ifast_mmx
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
@@ -662,6 +810,10 @@
(DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
const int * idct_coefs);
+EXTERN(void) jsimd_idct_ifast_altivec
+ (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
/* Floating Point Inverse DCT */
EXTERN(void) jsimd_idct_float_3dnow
(void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
diff --git a/simd/jsimd_altivec.h b/simd/jsimd_altivec.h
new file mode 100644
index 0000000..2660219
--- /dev/null
+++ b/simd/jsimd_altivec.h
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+/* Initializer helpers:  each expands to a comma-separated list that repeats
+ * its argument(s).  __4X2(a, b) yields four (a, b) pairs, i.e. 8 values.
+ */
+#define __4X(a)      a, a, a, a
+#define __4X2(a, b)  a, b, a, b, a, b, a, b
+#define __8X(a)      __4X(a), __4X(a)
+#define __16X(a)     __4X(a), __4X(a), __4X(a), __4X(a)
+
+/* TRANSPOSE(src, dst) transposes the 8x8 matrix of 16-bit elements held in
+ * the vectors src##0 ... src##7 and leaves the result in dst##0 ... dst##7.
+ * It is done with three rounds of vec_mergeh()/vec_mergel().  In the element
+ * maps below, "rc" denotes the element from row r, column c of the input.
+ * All inputs are read before any output is written.
+ */
+#define TRANSPOSE(src, dst) \
+{ \
+  __vector short pair04lo, pair04hi, pair15lo, pair15hi, \
+                 pair26lo, pair26hi, pair37lo, pair37hi; \
+  __vector short even01, odd01, even23, odd23, \
+                 even45, odd45, even67, odd67; \
+  \
+  /* round 1:  interleave rows that are four apart */ \
+  pair04lo = vec_mergeh(src##0, src##4);  /* (00 40 01 41 02 42 03 43) */ \
+  pair04hi = vec_mergel(src##0, src##4);  /* (04 44 05 45 06 46 07 47) */ \
+  pair15lo = vec_mergeh(src##1, src##5);  /* (10 50 11 51 12 52 13 53) */ \
+  pair15hi = vec_mergel(src##1, src##5);  /* (14 54 15 55 16 56 17 57) */ \
+  pair26lo = vec_mergeh(src##2, src##6);  /* (20 60 21 61 22 62 23 63) */ \
+  pair26hi = vec_mergel(src##2, src##6);  /* (24 64 25 65 26 66 27 67) */ \
+  pair37lo = vec_mergeh(src##3, src##7);  /* (30 70 31 71 32 72 33 73) */ \
+  pair37hi = vec_mergel(src##3, src##7);  /* (34 74 35 75 36 76 37 77) */ \
+  \
+  /* round 2:  gather the even rows and the odd rows of each column pair */ \
+  even01 = vec_mergeh(pair04lo, pair26lo);  /* (00 20 40 60 01 21 41 61) */ \
+  even23 = vec_mergel(pair04lo, pair26lo);  /* (02 22 42 62 03 23 43 63) */ \
+  even45 = vec_mergeh(pair04hi, pair26hi);  /* (04 24 44 64 05 25 45 65) */ \
+  even67 = vec_mergel(pair04hi, pair26hi);  /* (06 26 46 66 07 27 47 67) */ \
+  odd01 = vec_mergeh(pair15lo, pair37lo);   /* (10 30 50 70 11 31 51 71) */ \
+  odd23 = vec_mergel(pair15lo, pair37lo);   /* (12 32 52 72 13 33 53 73) */ \
+  odd45 = vec_mergeh(pair15hi, pair37hi);   /* (14 34 54 74 15 35 55 75) */ \
+  odd67 = vec_mergel(pair15hi, pair37hi);   /* (16 36 56 76 17 37 57 77) */ \
+  \
+  /* round 3:  interleave even and odd rows to finish each column */ \
+  dst##0 = vec_mergeh(even01, odd01);  /* (00 10 20 30 40 50 60 70) */ \
+  dst##1 = vec_mergel(even01, odd01);  /* (01 11 21 31 41 51 61 71) */ \
+  dst##2 = vec_mergeh(even23, odd23);  /* (02 12 22 32 42 52 62 72) */ \
+  dst##3 = vec_mergel(even23, odd23);  /* (03 13 23 33 43 53 63 73) */ \
+  dst##4 = vec_mergeh(even45, odd45);  /* (04 14 24 34 44 54 64 74) */ \
+  dst##5 = vec_mergel(even45, odd45);  /* (05 15 25 35 45 55 65 75) */ \
+  dst##6 = vec_mergeh(even67, odd67);  /* (06 16 26 36 46 56 66 76) */ \
+  dst##7 = vec_mergel(even67, odd67);  /* (07 17 27 37 47 57 67 77) */ \
+}
+
+/* Fallback definition of min(), used only if nothing else has defined it.
+ * Each argument may be evaluated more than once.
+ */
+#ifndef min
+#define min(a,b) ((b) > (a) ? (a) : (b))
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling
+ *
+ * VEC_LD()/VEC_ST() map to vec_ld()/vec_st() on big endian builds and to
+ * vec_vsx_ld()/vec_vsx_st() otherwise.  VEC_UNPACKHU()/VEC_UNPACKLU() merge
+ * their argument with pb_zero (which the caller must have in scope), with the
+ * operand order depending on endianness.
+ */
+
+#if !__BIG_ENDIAN__
+
+#define VEC_LD(off, ptr) vec_vsx_ld(off, ptr)
+#define VEC_ST(val, off, ptr) vec_vsx_st(val, off, ptr)
+#define VEC_UNPACKHU(v) vec_mergeh(v, pb_zero)
+#define VEC_UNPACKLU(v) vec_mergel(v, pb_zero)
+
+#else
+
+#define VEC_LD(off, ptr) vec_ld(off, ptr)
+#define VEC_ST(val, off, ptr) vec_st(val, off, ptr)
+#define VEC_UNPACKHU(v) vec_mergeh(pb_zero, v)
+#define VEC_UNPACKLU(v) vec_mergel(pb_zero, v)
+
+#endif
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
new file mode 100644
index 0000000..2fc6814
--- /dev/null
+++ b/simd/jsimd_powerpc.c
@@ -0,0 +1,726 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2014-2015 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+/* Bit mask of usable SIMD extensions; ~0 means "not initialized yet" */
+static unsigned int simd_support = ~0;
+
+/* Initialize simd_support the first time it is needed.
+ *
+ * AltiVec is assumed to be available (no run-time detection is performed
+ * here) unless the JSIMD_FORCENONE environment variable is set to "1", in
+ * which case all SIMD support is turned off.  Subsequent calls do nothing.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *force_none;
+
+  if (simd_support != ~0U)
+    return;                     /* already initialized */
+
+  /* Force different settings through environment variables */
+  force_none = getenv("JSIMD_FORCENONE");
+  if (force_none != NULL && strcmp(force_none, "1") == 0)
+    simd_support = 0;
+  else
+    simd_support = JSIMD_ALTIVEC;
+}
+
+/* Returns 1 if the build-time sample, dimension, and RGB pixel sizes match
+ * what the AltiVec code expects and JSIMD_ALTIVEC is set in simd_support;
+ * returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
+      (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4))
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Returns 1 if the build-time sample, dimension, and RGB pixel sizes match
+ * what the AltiVec code expects and JSIMD_ALTIVEC is set in simd_support;
+ * returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
+      (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4))
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Returns 1 if the build-time sample, dimension, and RGB pixel sizes match
+ * what the AltiVec code expects and JSIMD_ALTIVEC is set in simd_support;
+ * returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
+      (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4))
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0; /* unconditionally reports the YCC-->RGB565 SIMD path as unavailable */
+}
+
+/* Select the AltiVec RGB-->YCC routine that matches cinfo->in_color_space
+ * (the plain RGB routine is used for any color space not listed) and run it
+ * on cinfo->image_width pixels per row.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*convert)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      convert = jsimd_extrgb_ycc_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      convert = jsimd_extrgbx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      convert = jsimd_extbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      convert = jsimd_extbgrx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      convert = jsimd_extxbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      convert = jsimd_extxrgb_ycc_convert_altivec;
+      break;
+    default:
+      convert = jsimd_rgb_ycc_convert_altivec;
+      break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Select the AltiVec RGB-->grayscale routine that matches
+ * cinfo->in_color_space (the plain RGB routine is used for any color space
+ * not listed) and run it on cinfo->image_width pixels per row.
+ */
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*convert)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      convert = jsimd_extrgb_gray_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      convert = jsimd_extrgbx_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      convert = jsimd_extbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      convert = jsimd_extbgrx_gray_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      convert = jsimd_extxbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      convert = jsimd_extxrgb_gray_convert_altivec;
+      break;
+    default:
+      convert = jsimd_rgb_gray_convert_altivec;
+      break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Select the AltiVec YCC-->RGB routine that matches cinfo->out_color_space
+ * (the plain RGB routine is used for any color space not listed) and run it
+ * on cinfo->output_width pixels per row.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*convert)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      convert = jsimd_ycc_extrgb_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      convert = jsimd_ycc_extrgbx_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      convert = jsimd_ycc_extbgr_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      convert = jsimd_ycc_extbgrx_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      convert = jsimd_ycc_extxbgr_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      convert = jsimd_ycc_extxrgb_convert_altivec;
+      break;
+    default:
+      convert = jsimd_ycc_rgb_convert_altivec;
+      break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{ /* No-op stub; presumably never called, since jsimd_can_ycc_rgb565() returns 0 */
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Forward to the AltiVec h2v2 downsampler, passing the image width and
+ * maximum vertical sampling factor from cinfo and the vertical sampling
+ * factor and width in blocks from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width,
+                                cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data,
+                                output_data);
+}
+
+/* Forward to the AltiVec h2v1 downsampler, passing the image width and
+ * maximum vertical sampling factor from cinfo and the vertical sampling
+ * factor and width in blocks from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width,
+                                cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data,
+                                output_data);
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Forward to the AltiVec h2v2 upsampler using the maximum vertical sampling
+ * factor and output width from cinfo.  compptr is not used.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor,
+                              cinfo->output_width,
+                              input_data,
+                              output_data_ptr);
+}
+
+/* Forward to the AltiVec h2v1 upsampler using the maximum vertical sampling
+ * factor and output width from cinfo.  compptr is not used.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor,
+                              cinfo->output_width,
+                              input_data,
+                              output_data_ptr);
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Returns 1 if the build-time sample and dimension sizes match what the
+ * AltiVec code expects and JSIMD_ALTIVEC is set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4)
+    return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+
+  return 0;
+}
+
+/* Forward to the AltiVec h2v2 fancy upsampler using the maximum vertical
+ * sampling factor from cinfo and the downsampled width from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width,
+                                    input_data,
+                                    output_data_ptr);
+}
+
+/* Forward to the AltiVec h2v1 fancy upsampler using the maximum vertical
+ * sampling factor from cinfo and the downsampled width from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width,
+                                    input_data,
+                                    output_data_ptr);
+}
+
+/* Availability check for the AltiVec h2v2 merged upsampler.  Calls
+ * init_simd(), then returns 1 if samples are 8-bit, JDIMENSION is 4 bytes
+ * wide, and simd_support carries the JSIMD_ALTIVEC bit; otherwise returns 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  /* Bail out unless the build matches the only sample size and JDIMENSION
+     width that the AltiVec code is optimised for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Returns 1 only if samples are 8-bit, JDIMENSION occupies 4 bytes, and
+ * simd_support has JSIMD_ALTIVEC set once init_simd() has run; else returns 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  /* The AltiVec code is optimised for these values only */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4) {
+    if (simd_support & JSIMD_ALTIVEC)
+      return 1;
+  }
+
+  return 0;
+}
+
+/* Picks the AltiVec h2v2 merged upsampler that matches cinfo->out_color_space
+ * (the generic routine for any colorspace not listed) and invokes it with
+ * cinfo->output_width plus the caller's buffers and row-group counter.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Start from the generic routine; the switch below only overrides it. */
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v2_merged_upsample_altivec;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = jsimd_h2v2_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+      upsample = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      upsample = jsimd_h2v2_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+      upsample = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+      upsample = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+      upsample = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+      break;
+    default:
+      /* every other colorspace keeps the generic routine */
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/* Picks the AltiVec h2v1 merged upsampler that matches cinfo->out_color_space
+ * (the generic routine for any colorspace not listed) and invokes it with
+ * cinfo->output_width plus the caller's buffers and row-group counter.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Start from the generic routine; the switch below only overrides it. */
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v1_merged_upsample_altivec;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = jsimd_h2v1_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+      upsample = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      upsample = jsimd_h2v1_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+      upsample = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+      upsample = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+      upsample = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+      break;
+    default:
+      /* every other colorspace keeps the generic routine */
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/* Returns 1 when DCTSIZE is 8, samples are 8-bit, JDIMENSION is 4 bytes,
+ * DCTELEM is 2 bytes, and simd_support has JSIMD_ALTIVEC set (init_simd()
+ * is called first); returns 0 in every other case.
+ */
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  int layout_ok;
+
+  init_simd();
+
+  /* The AltiVec code is optimised for these values only */
+  layout_ok = (DCTSIZE == 8) && (BITS_IN_JSAMPLE == 8) &&
+              (sizeof(JDIMENSION) == 4) && (sizeof(DCTELEM) == 2);
+
+  if (layout_ok && (simd_support & JSIMD_ALTIVEC))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+ return 0; /* unconditional: unlike jsimd_can_convsamp(), nothing is probed */
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM * workspace)
+{ /* pass-through: all three arguments go unchanged to the AltiVec routine */
+ jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT * workspace)
+{ /* no-op: empty body, so sample_data and workspace are left untouched */
+}
+
+/* Returns 1 only if DCTSIZE is 8, DCTELEM occupies 2 bytes, and simd_support
+ * has JSIMD_ALTIVEC set once init_simd() has run; otherwise returns 0.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* The AltiVec code is optimised for these values only */
+  if (DCTSIZE == 8 && sizeof(DCTELEM) == 2) {
+    if (simd_support & JSIMD_ALTIVEC)
+      return 1;
+  }
+
+  return 0;
+}
+
+/* Availability check for jsimd_fdct_ifast().  Calls init_simd(), then
+ * returns 1 if DCTSIZE is 8, DCTELEM is 2 bytes wide, and simd_support
+ * carries the JSIMD_ALTIVEC bit; otherwise returns 0.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* Bail out unless the build matches the only block size and DCTELEM
+     width that the AltiVec code is optimised for */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+ return 0; /* unconditional: no init_simd() call and no simd_support test */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+ jsimd_fdct_islow_altivec(data); /* pass-through; no checks made here */
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+ jsimd_fdct_ifast_altivec(data); /* pass-through; no checks made here */
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{ /* no-op: empty body, data is never read or written */
+}
+
+/* Returns 1 when DCTSIZE is 8, JCOEF and DCTELEM are both 2 bytes, and
+ * simd_support has JSIMD_ALTIVEC set (init_simd() runs first); else 0.
+ */
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  int layout_ok;
+
+  init_simd();
+
+  /* The AltiVec code is optimised for these values only */
+  layout_ok = (DCTSIZE == 8) && (sizeof(JCOEF) == 2) && (sizeof(DCTELEM) == 2);
+
+  if (layout_ok && (simd_support & JSIMD_ALTIVEC))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+ return 0; /* unconditional: unlike jsimd_can_quantize(), nothing is probed */
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)
+{ /* pass-through: all three arguments go unchanged to the AltiVec routine */
+ jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)
+{ /* no-op: empty body, so coef_block is never written */
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+ return 0; /* unconditional: no init_simd() call and no simd_support test */
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+ return 0; /* unconditional: no init_simd() call and no simd_support test */
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* no-op: empty body; jsimd_can_idct_2x2() in this file always returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* no-op: empty body; jsimd_can_idct_4x4() in this file always returns 0 */
+}
+
+/* Returns 1 only if DCTSIZE is 8, JCOEF occupies 2 bytes, and simd_support
+ * has JSIMD_ALTIVEC set once init_simd() has run; otherwise returns 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* The AltiVec code is optimised for these values only */
+  if (DCTSIZE == 8 && sizeof(JCOEF) == 2) {
+    if (simd_support & JSIMD_ALTIVEC)
+      return 1;
+  }
+
+  return 0;
+}
+
+/* Availability check for jsimd_idct_ifast().  Calls init_simd(), then
+ * returns 1 if DCTSIZE is 8, JCOEF is 2 bytes wide, and simd_support
+ * carries the JSIMD_ALTIVEC bit; otherwise returns 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* Bail out unless the build matches the only block size and JCOEF
+     width that the AltiVec code is optimised for */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+ return 0; /* unconditional: no init_simd() call and no simd_support test */
+}
+
+/* Runs jsimd_idct_islow_altivec() on compptr->dct_table; cinfo is not read. */
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+    JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  void * table = compptr->dct_table;
+  jsimd_idct_islow_altivec(table, coef_block, output_buf, output_col);
+}
+
+/* Runs jsimd_idct_ifast_altivec() on compptr->dct_table; cinfo is not read. */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+    JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  void * table = compptr->dct_table;
+  jsimd_idct_ifast_altivec(table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* no-op: empty body; jsimd_can_idct_float() in this file always returns 0 */
+}