Merge branch '1.4.x'
diff --git a/BUILDING.md b/BUILDING.md
new file mode 100644
index 0000000..a0af863
--- /dev/null
+++ b/BUILDING.md
@@ -0,0 +1,865 @@
+Building on Un*x Platforms (including Cygwin and OS X)
+=======================================================
+
+
+Build Requirements
+------------------
+
+- autoconf 2.56 or later
+- automake 1.7 or later
+- libtool 1.4 or later
+  * If using Xcode 4.3 or later on OS X, autoconf and automake are no longer
+    provided.  The easiest way to obtain them is from
+    [MacPorts](http://www.MacPorts.org).
+
+- NASM or YASM (if building x86 or x86-64 SIMD extensions)
+  * If using NASM, 0.98, or 2.01 or later is required for an x86 build (0.99
+    and 2.00 do not work properly with libjpeg-turbo's x86 SIMD code.)
+  * If using NASM, 2.00 or later is required for an x86-64 build.
+  * If using NASM, 2.07 or later (except 2.11.08) is required for an x86-64
+    Mac build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD
+    code when building macho64 objects.)  NASM or YASM can be obtained from
+    [MacPorts](http://www.macports.org/).
+
+  The binary RPMs released by the NASM project do not work on older Linux
+  systems, such as Red Hat Enterprise Linux 4.  On such systems, you can
+  easily build and install NASM from a source RPM by downloading one of the
+  SRPMs from
+
+  <http://www.nasm.us/pub/nasm/releasebuilds>
+
+  and executing the following as root:
+
+        ARCH=`uname -m`
+        rpmbuild --rebuild nasm-{version}.src.rpm
+        rpm -Uvh /usr/src/redhat/RPMS/$ARCH/nasm-{version}.$ARCH.rpm
+
+  NOTE: the NASM build will fail if texinfo is not installed.
+
+- GCC v4.1 (or later) or clang recommended for best performance
+
+- If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
+  required.  Some systems, such as Solaris 10 and later and Red Hat Enterprise
+  Linux 5 and later, have this pre-installed.  On OS X 10.5 and 10.6, it will
+  be necessary to install the Java Developer Package, which can be downloaded
+  from <http://developer.apple.com/downloads> (Apple ID required.)  For other
+  systems, you can obtain the Oracle Java Development Kit from
+  <http://www.java.com>.
+
+
+Out-of-Tree Builds
+------------------
+
+Binary objects, libraries, and executables are generated in the same directory
+from which `configure` was executed (the "binary directory"), and this
+directory need not necessarily be the same as the libjpeg-turbo source
+directory.  You can create multiple independent binary directories, in which
+different versions of libjpeg-turbo can be built from the same source tree
+using different compilers or settings.  In the sections below,
+*{build_directory}* refers to the binary directory, whereas
+*{source_directory}* refers to the libjpeg-turbo source directory.  For in-tree
+builds, these directories are the same.
+
+
+Building libjpeg-turbo
+----------------------
+
+The following procedure will build libjpeg-turbo on Linux, FreeBSD, Cygwin, and
+Solaris/x86 systems (on Solaris, this generates a 32-bit library.  See below
+for 64-bit build instructions.)
+
+    cd {source_directory}
+    autoreconf -fiv
+    cd {build_directory}
+    sh {source_directory}/configure [additional configure flags]
+    make
+
+NOTE: Running autoreconf in the source directory is not necessary if building
+libjpeg-turbo from one of the official release tarballs.
+
+This will generate the following files under .libs/:
+
+**libjpeg.a**  
+Static link library for the libjpeg API
+
+**libjpeg.so.{version}** (Linux, Unix)  
+**libjpeg.{version}.dylib** (OS X)  
+**cygjpeg-{version}.dll** (Cygwin)  
+Shared library for the libjpeg API
+
+By default, *{version}* is 62.1.0, 7.1.0, or 8.0.2, depending on whether
+libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
+*{version}* is 62, 7, or 8.
+
+**libjpeg.so** (Linux, Unix)  
+**libjpeg.dylib** (OS X)  
+Development symlink for the libjpeg API
+
+**libjpeg.dll.a** (Cygwin)  
+Import library for the libjpeg API
+
+**libturbojpeg.a**  
+Static link library for the TurboJPEG API
+
+**libturbojpeg.so.0.1.0** (Linux, Unix)  
+**libturbojpeg.0.1.0.dylib** (OS X)  
+**cygturbojpeg-0.dll** (Cygwin)  
+Shared library for the TurboJPEG API
+
+**libturbojpeg.so** (Linux, Unix)  
+**libturbojpeg.dylib** (OS X)  
+Development symlink for the TurboJPEG API
+
+**libturbojpeg.dll.a** (Cygwin)  
+Import library for the TurboJPEG API
+
+
+### libjpeg v7 or v8 API/ABI Emulation
+
+Add `--with-jpeg7` to the `configure` command line to build a version of
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add `--with-jpeg8`
+to the `configure` command to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See [README.md](README.md) for more
+information on libjpeg v7 and v8 emulation.
+
+
+### In-Memory Source/Destination Managers
+
+When using libjpeg v6b or v7 API/ABI emulation, add `--without-mem-srcdst` to
+the `configure` command line to build a version of libjpeg-turbo that lacks the
+`jpeg_mem_src()` and `jpeg_mem_dest()` functions.  These functions were not
+part of the original libjpeg v6b and v7 APIs, so removing them ensures strict
+conformance with those APIs.  See [README.md](README.md) for more information.
+
+
+### Arithmetic Coding Support
+
+Since the patent on arithmetic coding has expired, this functionality has been
+included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
+based on the implementation in libjpeg v8, but it works when emulating libjpeg
+v7 or v6b as well.  The default is to enable both arithmetic encoding and
+decoding, but those who have philosophical objections to arithmetic coding can
+add `--without-arith-enc` or `--without-arith-dec` to the `configure` command
+line to disable encoding or decoding (respectively.)
+
+
+### TurboJPEG Java Wrapper
+
+Add `--with-java` to the `configure` command line to incorporate an optional
+Java Native Interface wrapper into the TurboJPEG shared library and build the
+Java front-end classes to support it.  This allows the TurboJPEG shared library
+to be used directly from Java applications.  See [java/README](java/README) for
+more details.
+
+You can set the `JAVAC`, `JAR`, and `JAVA` configure variables to specify
+alternate commands for javac, jar, and java (respectively.)  You can also
+set the `JAVACFLAGS` configure variable to specify arguments that should be
+passed to the Java compiler when building the front-end classes, and
+`JNI_CFLAGS` to specify arguments that should be passed to the C compiler when
+building the JNI wrapper.  Run `configure --help` for more details.
+
+
+Installing libjpeg-turbo
+------------------------
+
+If you intend to install these libraries and the associated header files, then
+replace `make` in the instructions above with
+
+    make install prefix={base dir} libdir={library directory}
+
+For example,
+
+    make install prefix=/usr/local libdir=/usr/local/lib64
+
+will install the header files in /usr/local/include and the library files in
+/usr/local/lib64.  If `prefix` and `libdir` are not specified, then the default
+is to install the header files in /opt/libjpeg-turbo/include and the library
+files in /opt/libjpeg-turbo/lib32 (32-bit) or /opt/libjpeg-turbo/lib64
+(64-bit.)
+
+NOTE: You can specify a prefix of /usr and a libdir of, for instance,
+/usr/lib64 to overwrite the system's version of libjpeg.  If you do this,
+however, then be sure to BACK UP YOUR SYSTEM'S INSTALLATION OF LIBJPEG before
+overwriting it.  It is recommended that you instead install libjpeg-turbo into
+a non-system directory and manipulate the `LD_LIBRARY_PATH` or create symlinks
+to force applications to use libjpeg-turbo instead of libjpeg.  See
+[README.md](README.md) for more information.
+
+
+Build Recipes
+-------------
+
+
+### 32-bit Build on 64-bit Linux
+
+Add
+
+    --host i686-pc-linux-gnu CFLAGS='-O3 -m32' LDFLAGS=-m32
+
+to the `configure` command line.
+
+
+### 64-bit Build on 64-bit OS X
+
+Add
+
+    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm
+
+to the `configure` command line.  NASM 2.07 or later from MacPorts must be
+installed.
+
+
+### 32-bit Build on 64-bit OS X
+
+Add
+
+    --host i686-apple-darwin CFLAGS='-O3 -m32' LDFLAGS=-m32
+
+to the `configure` command line.
+
+
+### 64-bit Backward-Compatible Build on 64-bit OS X
+
+Add
+
+    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm \
+      CFLAGS='-mmacosx-version-min=10.5 -O3' \
+      LDFLAGS='-mmacosx-version-min=10.5'
+
+to the `configure` command line.  NASM 2.07 or later from MacPorts must be
+installed.
+
+
+### 32-bit Backward-Compatible Build on OS X
+
+Add
+
+    --host i686-apple-darwin \
+      CFLAGS='-mmacosx-version-min=10.5 -O3 -m32' \
+      LDFLAGS='-mmacosx-version-min=10.5 -m32'
+
+to the `configure` command line.
+
+
+### 64-bit Build on 64-bit Solaris
+
+Add
+
+    --host x86_64-pc-solaris CFLAGS='-O3 -m64' LDFLAGS=-m64
+
+to the `configure` command line.
+
+
+### 32-bit Build on 64-bit FreeBSD
+
+Add
+
+    --host i386-unknown-freebsd CC='gcc -B /usr/lib32' CFLAGS='-O3 -m32' \
+      LDFLAGS='-B/usr/lib32'
+
+to the `configure` command line.  NASM 2.07 or later from FreeBSD ports must be
+installed.
+
+
+### Oracle Solaris Studio
+
+Add
+
+    CC=cc
+
+to the `configure` command line.  libjpeg-turbo will automatically be built
+with the maximum optimization level (-xO5) unless you override `CFLAGS`.
+
+To build a 64-bit version of libjpeg-turbo using Oracle Solaris Studio, add
+
+    --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
+
+to the `configure` command line.
+
+
+### MinGW Build on Cygwin
+
+Use CMake (see recipes below)
+
+
+ARM Support
+-----------
+
+This release of libjpeg-turbo can use ARM NEON SIMD instructions to accelerate
+JPEG compression/decompression by approximately 2-4x on ARMv7 and later
+platforms.  If libjpeg-turbo is configured on an ARM Linux platform, then the
+build system will automatically include the NEON SIMD routines, if they are
+supported.  Build instructions for other ARM-based platforms follow.
+
+
+### Building libjpeg-turbo for iOS
+
+iOS platforms, such as the iPhone and iPad, use ARM processors, some of which
+support NEON instructions.  Additional steps are required in order to build
+libjpeg-turbo for these platforms.
+
+
+#### Additional build requirements
+
+- [gas-preprocessor.pl](
+  https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl)
+  should be installed in your `PATH`.
+
+
+#### ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC)
+
+Set the following shell variables for simplicity:
+
+  *Xcode 4.2 and earlier*
+
+    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
+
+  *Xcode 4.3 and later*
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+
+  *All Xcode versions*
+
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
+
+  *ARMv6 (code will run on all iOS devices, not SIMD-accelerated)*  
+  [NOTE: Requires Xcode 4.4.x or earlier]
+
+    IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
+
+  *ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer)*
+
+    IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
+
+  *ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer)*  
+  [NOTE: Requires Xcode 4.5 or later]
+
+    IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host arm-apple-darwin10 \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+#### ARM 32-bit Build (Xcode 5.0.x and later, Clang)
+
+Set the following shell variables for simplicity:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+
+  *ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer)*
+
+    IOS_CFLAGS="-arch armv7"
+
+  *ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer)*
+
+    IOS_CFLAGS="-arch armv7s"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host arm-apple-darwin10 \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
+      CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+#### ARMv8 64-bit Build (Xcode 5.0.x and later, Clang)
+
+Code will run on iPhone 5S/iPad Mini 2/iPad Air and newer.
+
+Set the following shell variables for simplicity:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    IOS_CFLAGS="-arch arm64"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host aarch64-apple-darwin \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+NOTE:  You can also add `-miphoneos-version-min={version}` to `$IOS_CFLAGS`
+above in order to support older versions of iOS than the default version
+supported by the SDK.
+
+Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
+into a universal library.
+
+
+### Building libjpeg-turbo for Android
+
+Building libjpeg-turbo for Android platforms requires the
+[Android NDK](https://developer.android.com/tools/sdk/ndk)
+and autotools.  The following is a general recipe script that can be modified for your specific needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/ndk}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support-- for example,
+      "16", "19", etc.  "21" or later is required for a 64-bit build.}
+
+    # 32-bit ARMv7 build
+    HOST=arm-linux-androideabi
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
+    ANDROID_CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
+      --sysroot=${SYSROOT}"
+
+    # 64-bit ARMv8 build
+    HOST=aarch64-linux-android
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
+    ANDROID_CFLAGS="--sysroot=${SYSROOT}"
+
+    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+    ANDROID_INCLUDES="-I${SYSROOT}/usr/include -I${TOOLCHAIN}/include"
+    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
+    export AR=${TOOLCHAIN}/bin/${HOST}-ar
+    export AS=${TOOLCHAIN}/bin/${HOST}-as
+    export NM=${TOOLCHAIN}/bin/${HOST}-nm
+    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
+    export LD=${TOOLCHAIN}/bin/${HOST}-ld
+    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
+    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
+    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
+    cd {build_directory}
+    sh {source_directory}/configure --host=${HOST} \
+      CFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS} -O3 -fPIE" \
+      CPPFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS}" \
+      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
+    make
+
+If building for Android 4.0.x (API level < 16) or earlier, remove `-fPIE` from
+`CFLAGS` and `-pie` from `LDFLAGS`.
+
+
+Building on Windows (Visual C++ or MinGW)
+=========================================
+
+
+Build Requirements
+------------------
+
+- [CMake](http://www.cmake.org) v2.8.11 or later
+
+- [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
+  * If using NASM, 0.98 or later is required for an x86 build.
+  * If using NASM, 2.05 or later is required for an x86-64 build.
+  * nasm.exe should be in your `PATH`.
+
+- Microsoft Visual C++ 2005 or later
+
+  If you don't already have Visual C++, then the easiest way to get it is by
+  installing the
+  [Windows SDK](http://msdn.microsoft.com/en-us/windows/bb980924.aspx).
+  The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
+  everything necessary to build libjpeg-turbo.
+
+  * You can also use Microsoft Visual Studio Express/Community Edition, which
+    is a free download.  (NOTE: versions prior to 2012 can only be used to
+    build 32-bit code.)
+  * If you intend to build libjpeg-turbo from the command line, then add the
+    appropriate compiler and SDK directories to the `INCLUDE`, `LIB`, and
+    `PATH` environment variables.  This is generally accomplished by
+    executing `vcvars32.bat` or `vcvars64.bat` and `SetEnv.cmd`.
+    `vcvars32.bat` and `vcvars64.bat` are part of Visual C++ and are located in
+    the same directory as the compiler.  `SetEnv.cmd` is part of the Windows
+    SDK.  You can pass optional arguments to `SetEnv.cmd` to specify a 32-bit
+    or 64-bit build environment.
+
+   ... OR ...
+
+- MinGW
+
+  [MinGW-builds](http://sourceforge.net/projects/mingwbuilds/) or
+  [tdm-gcc](http://tdm-gcc.tdragon.net/) recommended if building on a Windows
+  machine.  Both distributions install a Start Menu link that can be used to
+  launch a command prompt with the appropriate compiler paths automatically
+  set.
+
+- If building the TurboJPEG Java wrapper, JDK 1.5 or later is required.  This
+  can be downloaded from <http://www.java.com>.
+
+
+Out-of-Tree Builds
+------------------
+
+Binary objects, libraries, and executables are generated in the same directory
+from which `cmake` was executed (the "binary directory"), and this directory
+need not necessarily be the same as the libjpeg-turbo source directory.  You
+can create multiple independent binary directories, in which different versions
+of libjpeg-turbo can be built from the same source tree using different
+compilers or settings.  In the sections below, *{build_directory}* refers to
+the binary directory, whereas *{source_directory}* refers to the libjpeg-turbo
+source directory.  For in-tree builds, these directories are the same.
+
+
+Building libjpeg-turbo
+----------------------
+
+
+### Visual C++ (Command Line)
+
+    cd {build_directory}
+    cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release {source_directory}
+    nmake
+
+This will build either a 32-bit or a 64-bit version of libjpeg-turbo, depending
+on which version of cl.exe is in the `PATH`.
+
+The following files will be generated under *{build_directory}*:
+
+**jpeg-static.lib**  
+Static link library for the libjpeg API
+
+**sharedlib/jpeg{version}.dll**  
+DLL for the libjpeg API
+
+**sharedlib/jpeg.lib**  
+Import library for the libjpeg API
+
+**turbojpeg-static.lib**  
+Static link library for the TurboJPEG API
+
+**turbojpeg.dll**  
+DLL for the TurboJPEG API
+
+**turbojpeg.lib**  
+Import library for the TurboJPEG API
+
+*{version}* is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
+v8 emulation is enabled.
+
+
+### Visual C++ (IDE)
+
+Choose the appropriate CMake generator option for your version of Visual Studio
+(run `cmake` with no arguments for a list of available generators.)  For
+instance:
+
+    cd {build_directory}
+    cmake -G "Visual Studio 10" {source_directory}
+
+NOTE:  Add "Win64" to the generator name (for example, "Visual Studio 10
+Win64") to build a 64-bit version of libjpeg-turbo.  Recent versions of CMake
+no longer document that.  A separate build directory must be used for 32-bit
+and 64-bit builds.
+
+You can then open ALL_BUILD.vcproj in Visual Studio and build one of the
+configurations in that project ("Debug", "Release", etc.) to generate a full
+build of libjpeg-turbo.
+
+This will generate the following files under *{build_directory}*:
+
+**{configuration}/jpeg-static.lib**  
+Static link library for the libjpeg API
+
+**sharedlib/{configuration}/jpeg{version}.dll**  
+DLL for the libjpeg API
+
+**sharedlib/{configuration}/jpeg.lib**  
+Import library for the libjpeg API
+
+**{configuration}/turbojpeg-static.lib**  
+Static link library for the TurboJPEG API
+
+**{configuration}/turbojpeg.dll**  
+DLL for the TurboJPEG API
+
+**{configuration}/turbojpeg.lib**  
+Import library for the TurboJPEG API
+
+*{configuration}* is Debug, Release, RelWithDebInfo, or MinSizeRel, depending
+on the configuration you built in the IDE, and *{version}* is 62, 7, or 8,
+depending on whether libjpeg v6b (default), v7, or v8 emulation is enabled.
+
+
+### MinGW
+
+NOTE: This assumes that you are building on a Windows machine.  If you are
+cross-compiling on a Linux/Unix machine, then see "Build Recipes" below.
+
+    cd {build_directory}
+    cmake -G "MinGW Makefiles" {source_directory}
+    mingw32-make
+
+This will generate the following files under *{build_directory}*:
+
+**libjpeg.a**  
+Static link library for the libjpeg API
+
+**sharedlib/libjpeg-{version}.dll**  
+DLL for the libjpeg API
+
+**sharedlib/libjpeg.dll.a**  
+Import library for the libjpeg API
+
+**libturbojpeg.a**  
+Static link library for the TurboJPEG API
+
+**libturbojpeg.dll**  
+DLL for the TurboJPEG API
+
+**libturbojpeg.dll.a**  
+Import library for the TurboJPEG API
+
+*{version}* is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
+v8 emulation is enabled.
+
+
+### Debug Build
+
+Add `-DCMAKE_BUILD_TYPE=Debug` to the `cmake` command line.  Or, if building
+with NMake, remove `-DCMAKE_BUILD_TYPE=Release` (Debug builds are the default
+with NMake.)
+
+
+### libjpeg v7 or v8 API/ABI Emulation
+
+Add `-DWITH_JPEG7=1` to the `cmake` command line to build a version of
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add `-DWITH_JPEG8=1`
+to the `cmake` command line to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See [README.md](README.md) for more
+information on libjpeg v7 and v8 emulation.
+
+
+### In-Memory Source/Destination Managers
+
+When using libjpeg v6b or v7 API/ABI emulation, add `-DWITH_MEM_SRCDST=0` to
+the `cmake` command line to build a version of libjpeg-turbo that lacks the
+`jpeg_mem_src()` and `jpeg_mem_dest()` functions.  These functions were not
+part of the original libjpeg v6b and v7 APIs, so removing them ensures strict
+conformance with those APIs.  See [README.md](README.md) for more information.
+
+
+### Arithmetic Coding Support
+
+Since the patent on arithmetic coding has expired, this functionality has been
+included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
+based on the implementation in libjpeg v8, but it works when emulating libjpeg
+v7 or v6b as well.  The default is to enable both arithmetic encoding and
+decoding, but those who have philosophical objections to arithmetic coding can
+add `-DWITH_ARITH_ENC=0` or `-DWITH_ARITH_DEC=0` to the `cmake` command line to
+disable encoding or decoding (respectively.)
+
+
+### TurboJPEG Java Wrapper
+
+Add `-DWITH_JAVA=1` to the `cmake` command line to incorporate an optional Java
+Native Interface wrapper into the TurboJPEG shared library and build the Java
+front-end classes to support it.  This allows the TurboJPEG shared library to
+be used directly from Java applications.  See [java/README](java/README) for
+more details.
+
+You can set the `Java_JAVAC_EXECUTABLE`, `Java_JAVA_EXECUTABLE`, and
+`Java_JAR_EXECUTABLE` CMake variables to specify alternate commands or
+locations for javac, java, and jar (respectively.)  You can also set the
+`JAVACFLAGS` CMake variable to specify arguments that should be passed to the
+Java compiler when building the front-end classes.
+
+
+Installing libjpeg-turbo
+------------------------
+
+You can use the build system to install libjpeg-turbo into a directory of your
+choosing (as opposed to creating an installer.)  To do this, add:
+
+    -DCMAKE_INSTALL_PREFIX={install_directory}
+
+to the cmake command line.
+
+For example,
+
+    cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_INSTALL_PREFIX=c:\libjpeg-turbo {source_directory}
+    nmake install
+
+will install the header files in c:\libjpeg-turbo\include, the library files
+in c:\libjpeg-turbo\lib, the DLLs in c:\libjpeg-turbo\bin, and the
+documentation in c:\libjpeg-turbo\doc.
+
+
+Build Recipes
+-------------
+
+
+### 64-bit MinGW Build on Cygwin
+
+    cd {build_directory}
+    CC=/usr/bin/x86_64-w64-mingw32-gcc \
+      cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+      -DCMAKE_RC_COMPILER=/usr/bin/x86_64-w64-mingw32-windres.exe \
+      {source_directory}
+    make
+
+This produces a 64-bit build of libjpeg-turbo that does not depend on
+cygwin1.dll or other Cygwin DLLs.  The mingw64-x86\_64-gcc-core and
+mingw64-x86\_64-gcc-g++ packages (and their dependencies) must be installed.
+
+
+### 32-bit MinGW Build on Cygwin
+
+    cd {build_directory}
+    CC=/usr/bin/i686-w64-mingw32-gcc \
+      cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+      -DCMAKE_RC_COMPILER=/usr/bin/i686-w64-mingw32-windres.exe \
+      {source_directory}
+    make
+
+This produces a 32-bit build of libjpeg-turbo that does not depend on
+cygwin1.dll or other Cygwin DLLs.  The mingw64-i686-gcc-core and
+mingw64-i686-gcc-g++ packages (and their dependencies) must be installed.
+
+
+### MinGW Build on Linux
+
+    cd {build_directory}
+    CC={mingw_binary_path}/i686-pc-mingw32-gcc \
+      cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+      -DCMAKE_RC_COMPILER={mingw_binary_path}/i686-pc-mingw32-windres \
+      -DCMAKE_AR={mingw_binary_path}/i686-pc-mingw32-ar \
+      -DCMAKE_RANLIB={mingw_binary_path}/i686-pc-mingw32-ranlib \
+      {source_directory}
+    make
+
+
+Creating Release Packages
+=========================
+
+The following commands can be used to create various types of release packages:
+
+
+Unix/Linux
+----------
+
+    make rpm
+
+Create Red Hat-style binary RPM package.  Requires RPM v4 or later.
+
+    make srpm
+
+This runs `make dist` to create a pristine source tarball, then creates a
+Red Hat-style source RPM package from the tarball.  Requires RPM v4 or later.
+
+    make deb
+
+Create Debian-style binary package.  Requires dpkg.
+
+    make dmg
+
+Create Macintosh package/disk image.  This requires pkgbuild and
+productbuild, which are installed by default on OS X 10.7 and later and which
+can be obtained by installing Xcode 3.2.6 (with the "Unix Development"
+option) on OS X 10.6.  Packages built in this manner can be installed on OS X
+10.5 and later, but they must be built on OS X 10.6 or later.
+
+    make udmg [BUILDDIR32={32-bit build directory}]
+
+On 64-bit OS X systems, this creates a Macintosh package and disk image that
+contains universal i386/x86-64 binaries.  You should first configure a 32-bit
+out-of-tree build of libjpeg-turbo, then configure a 64-bit out-of-tree
+build, then run `make udmg` from the 64-bit build directory.  The build
+system will look for the 32-bit build under *{source_directory}*/osxx86 by
+default, but you can override this by setting the `BUILDDIR32` variable on the
+make command line as shown above.
+
+    make iosdmg [BUILDDIR32={32-bit build directory}] \
+      [BUILDDIRARMV6={ARMv6 build directory}] \
+      [BUILDDIRARMV7={ARMv7 build directory}] \
+      [BUILDDIRARMV7S={ARMv7s build directory}] \
+      [BUILDDIRARMV8={ARMv8 build directory}]
+
+On OS X systems, this creates a Macintosh package and disk image in which the
+libjpeg-turbo static libraries contain ARM architectures necessary to build
+iOS applications.  If building on an x86-64 system, the binaries will also
+contain the i386 architecture, as with `make udmg` above.  You should first
+configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
+libjpeg-turbo (see "Building libjpeg-turbo for iOS" above.)  If you are
+building an x86-64 version of libjpeg-turbo, you should configure a 32-bit
+out-of-tree build as well.  Next, build libjpeg-turbo as you would normally,
+using an out-of-tree build.  When it is built, run `make iosdmg` from the
+build directory.  The build system will look for the ARMv6 build under
+*{source_directory}*/iosarmv6 by default, the ARMv7 build under
+*{source_directory}*/iosarmv7 by default, the ARMv7s build under
+*{source_directory}*/iosarmv7s by default, the ARMv8 build under
+*{source_directory}*/iosarmv8 by default, and (if applicable) the 32-bit build
+under *{source_directory}*/osxx86 by default, but you can override this by
+setting the `BUILDDIR32`, `BUILDDIRARMV6`, `BUILDDIRARMV7`, `BUILDDIRARMV7S`,
+and/or `BUILDDIRARMV8` variables on the `make` command line as shown above.
+
+NOTE: If including an ARMv8 build in the package, then you may need to use
+Xcode's version of lipo instead of the operating system's.  To do this, pass
+an argument of `LIPO="xcrun lipo"` on the make command line.
+
+    make cygwinpkg
+
+Build a Cygwin binary package.
+
+
+Windows
+-------
+
+If using NMake:
+
+    cd {build_directory}
+    nmake installer
+
+If using MinGW:
+
+    cd {build_directory}
+    make installer
+
+If using the Visual Studio IDE, build the "installer" project.
+
+The installer package (libjpeg-turbo[-gcc][64].exe) will be located under
+*{build_directory}*.  If building using the Visual Studio IDE, then the
+installer package will be located in a subdirectory with the same name as the
+configuration you built (such as *{build_directory}*\Debug\ or
+*{build_directory}*\Release\).
+
+Building a Windows installer requires the Nullsoft Install System
+(<http://nsis.sourceforge.net/>.)  makensis.exe should be in your `PATH`.
+
+
+Regression testing
+==================
+
+The most common way to test libjpeg-turbo is by invoking `make test` on
+Unix/Linux platforms or `ctest` on Windows platforms, once the build has
+completed.  This runs a series of tests to ensure that mathematical
+compatibility has been maintained between libjpeg-turbo and libjpeg v6b.  This
+also invokes the TurboJPEG unit tests, which ensure that the colorspace
+extensions, YUV encoding, decompression scaling, and other features of the
+TurboJPEG C and Java APIs are working properly (and, by extension, that the
+equivalent features of the underlying libjpeg API are also working.)
+
+Invoking `make testclean` or `nmake testclean` (if using NMake) or building
+the 'testclean' target (if using the Visual Studio IDE) will clean up the
+output images generated by `make test`.
+
+On Unix/Linux platforms, more extensive tests of the TurboJPEG C and Java
+wrappers can be run by invoking `make tjtest`.  These extended TurboJPEG tests
+essentially iterate through all of the available features of the TurboJPEG APIs
+that are not covered by the TurboJPEG unit tests (this includes the lossless
+transform options) and compare the images generated by each feature to images
+generated using the equivalent feature in the libjpeg API.  The extended
+TurboJPEG tests are meant to test for regressions in the TurboJPEG wrappers,
+not in the underlying libjpeg API library.
diff --git a/BUILDING.txt b/BUILDING.txt
deleted file mode 100644
index 38581ea..0000000
--- a/BUILDING.txt
+++ /dev/null
@@ -1,891 +0,0 @@
-*******************************************************************************
-**     Building on Un*x Platforms (including Cygwin and OS X)
-*******************************************************************************
-
-
-==================
-Build Requirements
-==================
-
--- autoconf 2.56 or later
--- automake 1.7 or later
--- libtool 1.4 or later
-   * If using Xcode 4.3 or later on OS X, autoconf and automake are no longer
-     provided.  The easiest way to obtain them is from MacPorts
-     (http://www.macports.org/).
-
--- NASM or YASM (if building x86 or x86-64 SIMD extensions)
-   * If using NASM, 0.98, or 2.01 or later is required for an x86 build (0.99
-     and 2.00 do not work properly with libjpeg-turbo's x86 SIMD code.)
-   * If using NASM, 2.00 or later is required for an x86-64 build.
-   * If using NASM, 2.07 or later (except 2.11.08) is required for an x86-64
-     Mac build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD
-     code when building macho64 objects.)  NASM or YASM can be obtained from
-     MacPorts (http://www.macports.org/).
-
-   The binary RPMs released by the NASM project do not work on older Linux
-   systems, such as Red Hat Enterprise Linux 4.  On such systems, you can
-   easily build and install NASM from a source RPM by downloading one of the
-   SRPMs from
-
-   http://www.nasm.us/pub/nasm/releasebuilds
-
-   and executing the following as root:
-
-     ARCH=`uname -m`
-     rpmbuild --rebuild nasm-{version}.src.rpm
-     rpm -Uvh /usr/src/redhat/RPMS/$ARCH/nasm-{version}.$ARCH.rpm
-
-   NOTE: the NASM build will fail if texinfo is not installed.
-
--- GCC v4.1 or later recommended for best performance
-   * Beginning with Xcode 4, Apple stopped distributing GCC and switched to
-     the LLVM compiler.  Xcode v4.0 through v4.6 provides a GCC front end
-     called LLVM-GCC.  Unfortunately, as of this writing, neither LLVM-GCC nor
-     the LLVM (clang) compiler produces optimal performance with libjpeg-turbo.
-     Building libjpeg-turbo with LLVM-GCC v4.2 results in a 10% performance
-     degradation when compressing using 64-bit code, relative to building
-     libjpeg-turbo with GCC v4.2.  Building libjpeg-turbo with LLVM (clang)
-     results in a 20% performance degradation when compressing using 64-bit
-     code, relative to building libjpeg-turbo with GCC v4.2.  If you are
-     running Snow Leopard or earlier, it is suggested that you continue to use
-     Xcode v3.2.6, which provides GCC v4.2.  If you are using Lion or later, it
-     is suggested that you install Apple GCC v4.2 or GCC v5 through MacPorts.
-
--- If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
-   required.  Some systems, such as Solaris 10 and later and Red Hat Enterprise
-   Linux 5 and later, have this pre-installed.  On OS X 10.5 and later, it will
-   be necessary to install the Java Developer Package, which can be downloaded
-   from http://developer.apple.com/downloads (Apple ID required.)  For systems
-   that do not have a JDK installed, you can obtain the Oracle Java Development
-   Kit from http://www.java.com.
-
-
-==================
-Out-of-Tree Builds
-==================
-
-Binary objects, libraries, and executables are generated in the same directory
-from which configure was executed (the "binary directory"), and this directory
-need not necessarily be the same as the libjpeg-turbo source directory.  You
-can create multiple independent binary directories, in which different versions
-of libjpeg-turbo can be built from the same source tree using different
-compilers or settings.  In the sections below, {build_directory} refers to the
-binary directory, whereas {source_directory} refers to the libjpeg-turbo source
-directory.  For in-tree builds, these directories are the same.
-
-
-======================
-Building libjpeg-turbo
-======================
-
-The following procedure will build libjpeg-turbo on Linux, FreeBSD, Cygwin, and
-Solaris/x86 systems (on Solaris, this generates a 32-bit library.  See below
-for 64-bit build instructions.)
-
-  cd {source_directory}
-  autoreconf -fiv
-  cd {build_directory}
-  sh {source_directory}/configure [additional configure flags]
-  make
-
-NOTE: Running autoreconf in the source directory is not necessary if building
-libjpeg-turbo from one of the official release tarballs.
-
-This will generate the following files under .libs/
-
-  libjpeg.a
-      Static link library for the libjpeg API
-
-  libjpeg.so.{version} (Linux, Unix)
-  libjpeg.{version}.dylib (OS X)
-  cygjpeg-{version}.dll (Cygwin)
-      Shared library for the libjpeg API
-
-  By default, {version} is 62.1.0, 7.1.0, or 8.0.2, depending on whether
-  libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
-  {version} is 62, 7, or 8.
-
-  libjpeg.so (Linux, Unix)
-  libjpeg.dylib (OS X)
-      Development symlink for the libjpeg API
-
-  libjpeg.dll.a (Cygwin)
-      Import library for the libjpeg API
-
-  libturbojpeg.a
-      Static link library for the TurboJPEG API
-
-  libturbojpeg.so.0.1.0 (Linux, Unix)
-  libturbojpeg.0.1.0.dylib (OS X)
-  cygturbojpeg-0.dll (Cygwin)
-      Shared library for the TurboJPEG API
-
-  libturbojpeg.so (Linux, Unix)
-  libturbojpeg.dylib (OS X)
-      Development symlink for the TurboJPEG API
-
-  libturbojpeg.dll.a (Cygwin)
-      Import library for the TurboJPEG API
-
-
-libjpeg v7 or v8 API/ABI Emulation
-----------------------------------
-
-Add --with-jpeg7 to the configure command line to build a version of
-libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add --with-jpeg8 to
-the configure command to build a version of libjpeg-turbo that is
-API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
-on libjpeg v7 and v8 emulation.
-
-
-In-Memory Source/Destination Managers
--------------------------------------
-
-When using libjpeg v6b or v7 API/ABI emulation, add --without-mem-srcdst to the
-configure command line to build a version of libjpeg-turbo that lacks the
-jpeg_mem_src() and jpeg_mem_dest() functions.  These functions were not part of
-the original libjpeg v6b and v7 APIs, so removing them ensures strict
-conformance with those APIs.  See README-turbo.txt for more information.
-
-
-Arithmetic Coding Support
--------------------------
-
-Since the patent on arithmetic coding has expired, this functionality has been
-included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
-based on the implementation in libjpeg v8, but it works when emulating libjpeg
-v7 or v6b as well.  The default is to enable both arithmetic encoding and
-decoding, but those who have philosophical objections to arithmetic coding can
-add --without-arith-enc or --without-arith-dec to the configure command line to
-disable encoding or decoding (respectively.)
-
-
-TurboJPEG Java Wrapper
-----------------------
-Add --with-java to the configure command line to incorporate an optional Java
-Native Interface wrapper into the TurboJPEG shared library and build the Java
-front-end classes to support it.  This allows the TurboJPEG shared library to
-be used directly from Java applications.  See java/README for more details.
-
-You can set the JAVAC, JAR, and JAVA configure variables to specify
-alternate commands for javac, jar, and java (respectively.)  You can also
-set the JAVACFLAGS configure variable to specify arguments that should be
-passed to the Java compiler when building the front-end classes, and JNI_CFLAGS
-to specify arguments that should be passed to the C compiler when building the
-JNI wrapper.  Run 'configure --help' for more details.
-
-
-========================
-Installing libjpeg-turbo
-========================
-
-If you intend to install these libraries and the associated header files, then
-replace 'make' in the instructions above with
-
-  make install prefix={base dir} libdir={library directory}
-
-For example,
-
-  make install prefix=/usr/local libdir=/usr/local/lib64
-
-will install the header files in /usr/local/include and the library files in
-/usr/local/lib64.  If 'prefix' and 'libdir' are not specified, then the default
-is to install the header files in /opt/libjpeg-turbo/include and the library
-files in /opt/libjpeg-turbo/lib32 (32-bit) or /opt/libjpeg-turbo/lib64
-(64-bit.)
-
-NOTE: You can specify a prefix of /usr and a libdir of, for instance,
-/usr/lib64 to overwrite the system's version of libjpeg.  If you do this,
-however, then be sure to BACK UP YOUR SYSTEM'S INSTALLATION OF LIBJPEG before
-overwriting it.  It is recommended that you instead install libjpeg-turbo into
-a non-system directory and manipulate the LD_LIBRARY_PATH or create symlinks
-to force applications to use libjpeg-turbo instead of libjpeg.  See
-README-turbo.txt for more information.
-
-
-=============
-Build Recipes
-=============
-
-
-32-bit Build on 64-bit Linux
-----------------------------
-
-Add
-
-  --host i686-pc-linux-gnu CFLAGS='-O3 -m32' LDFLAGS=-m32
-
-to the configure command line.
-
-
-64-bit Build on 64-bit OS X
----------------------------
-
-Add
-
-  --host x86_64-apple-darwin NASM=/opt/local/bin/nasm
-
-to the configure command line.  NASM 2.07 or later from MacPorts must be
-installed.
-
-
-32-bit Build on 64-bit OS X
----------------------------
-
-Add
-
-  --host i686-apple-darwin CFLAGS='-O3 -m32' LDFLAGS=-m32
-
-to the configure command line.
-
-
-64-bit Backward-Compatible Build on 64-bit OS X
------------------------------------------------
-
-Add
-
-  --host x86_64-apple-darwin NASM=/opt/local/bin/nasm \
-  CFLAGS='-isysroot /Developer/SDKs/MacOSX10.5.sdk \
-    -mmacosx-version-min=10.5 -O3' \
-    LDFLAGS='-isysroot /Developer/SDKs/MacOSX10.5.sdk \
-    -mmacosx-version-min=10.5'
-
-to the configure command line.  The OS X 10.5 SDK, and NASM 2.07 or later from
-MacPorts, must be installed.
-
-
-32-bit Backward-Compatible Build on OS X
-----------------------------------------
-
-Add
-
-  --host i686-apple-darwin \
-    CFLAGS='-isysroot /Developer/SDKs/MacOSX10.5.sdk \
-    -mmacosx-version-min=10.5 -O3 -m32' \
-    LDFLAGS='-isysroot /Developer/SDKs/MacOSX10.5.sdk \
-    -mmacosx-version-min=10.5 -m32'
-
-to the configure command line.  The OS X 10.5 SDK must be installed.
-
-
-64-bit Library Build on 64-bit Solaris
---------------------------------------
-
-Add
-
-  --host x86_64-pc-solaris CFLAGS='-O3 -m64' LDFLAGS=-m64
-
-to the configure command line.
-
-
-32-bit Build on 64-bit FreeBSD
-------------------------------
-
-Add
-
-  --host i386-unknown-freebsd CC='gcc -B /usr/lib32' CFLAGS='-O3 -m32' \
-    LDFLAGS='-B/usr/lib32'
-
-to the configure command line.  NASM 2.07 or later from FreeBSD ports must be
-installed.
-
-
-Oracle Solaris Studio
----------------------
-
-Add
-
-  CC=cc
-
-to the configure command line.  libjpeg-turbo will automatically be built with
-the maximum optimization level (-xO5) unless you override CFLAGS.
-
-To build a 64-bit version of libjpeg-turbo using Oracle Solaris Studio, add
-
-  --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
-
-to the configure command line.
-
-
-MinGW Build on Cygwin
----------------------
-
-Use CMake (see recipes below)
-
-
-===========
-ARM Support
-===========
-
-This release of libjpeg-turbo can use ARM NEON SIMD instructions to accelerate
-JPEG compression/decompression by approximately 2-4x on ARMv7 and later
-platforms.  If libjpeg-turbo is configured on an ARM Linux platform, then the
-build system will automatically include the NEON SIMD routines, if they are
-supported.  Build instructions for other ARM-based platforms follow.
-
-
-Building libjpeg-turbo for iOS
-------------------------------
-
-iOS platforms, such as the iPhone and iPad, use ARM processors, some of which
-support NEON instructions.  Additional steps are required in order to build
-libjpeg-turbo for these platforms.
-
-Additional build requirements:
-
-  gas-preprocessor.pl
-  (https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl)
-  should be installed in your PATH.
-
-
-ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC):
-
-Set the following shell variables for simplicity:
-
-  Xcode 4.2 and earlier:
-  IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
-  Xcode 4.3 and later:
-  IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-
-  IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
-  IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
-
-  ARMv6 (code will run on all iOS devices, not SIMD-accelerated):
-  [NOTE: Requires Xcode 4.4.x or earlier]
-  IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
-
-  ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
-  IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
-
-  ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
-  [NOTE: Requires Xcode 4.5 or later]
-  IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
-
-Follow the procedure under "Building libjpeg-turbo" above, adding
-
-  --host arm-apple-darwin10 \
-    CC="$IOS_GCC" LD="$IOS_GCC" \
-    CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
-    LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
-
-to the configure command line.
-
-
-ARM 32-bit Build (Xcode 5.0.x and later, Clang):
-
-Set the following shell variables for simplicity:
-
-  IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-  IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
-  IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-
-  ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer):
-  IOS_CFLAGS="-arch armv7"
-
-  ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer):
-  IOS_CFLAGS="-arch armv7s"
-
-Follow the procedure under "Building libjpeg-turbo" above, adding
-
-  --host arm-apple-darwin10 \
-    CC="$IOS_GCC" LD="$IOS_GCC" \
-    CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
-    LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
-    CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
-
-to the configure command line.
-
-
-ARMv8 64-bit Build (Xcode 5.0.x and later, Clang):
-
-Code will run on iPhone 5S/iPad Mini 2/iPad Air and newer.
-
-Set the following shell variables for simplicity:
-
-  IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-  IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
-  IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  IOS_CFLAGS="-arch arm64"
-
-Follow the procedure under "Building libjpeg-turbo" above, adding
-
-  --host aarch64-apple-darwin \
-    CC="$IOS_GCC" LD="$IOS_GCC" \
-    CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
-    LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
-
-to the configure command line.
-
-
-NOTE:  You can also add -miphoneos-version-min={version} to $IOS_CFLAGS above
-in order to support older versions of iOS than the default version supported by
-the SDK.
-
-Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
-into a universal library.
-
-
-Building libjpeg-turbo for Android
-----------------------------------
-
-Building libjpeg-turbo for Android platforms requires the Android NDK
-(https://developer.android.com/tools/sdk/ndk) and autotools.  The following is
-a general recipe script that can be modified for your specific needs.
-
-  # Set these variables to suit your needs
-  NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/ndk}
-  BUILD_PLATFORM={the platform name for the NDK package you installed--
-    for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-  TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-    toolchain directory under ${NDK_PATH}/toolchains/.}
-  ANDROID_VERSION={The minimum version of Android to support-- for example,
-    "16", "19", etc.  "21" or later is required for a 64-bit build.}
-
-  # 32-bit ARMv7 build
-  HOST=arm-linux-androideabi
-  SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
-  ANDROID_CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
-    --sysroot=${SYSROOT}"
-
-  # 64-bit ARMv8 build
-  HOST=aarch64-linux-android
-  SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
-  ANDROID_CFLAGS="--sysroot=${SYSROOT}"
-
-  TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-  ANDROID_INCLUDES="-I${SYSROOT}/usr/include -I${TOOLCHAIN}/include"
-  export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
-  export AR=${TOOLCHAIN}/bin/${HOST}-ar
-  export AS=${TOOLCHAIN}/bin/${HOST}-as
-  export NM=${TOOLCHAIN}/bin/${HOST}-nm
-  export CC=${TOOLCHAIN}/bin/${HOST}-gcc
-  export LD=${TOOLCHAIN}/bin/${HOST}-ld
-  export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
-  export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
-  export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
-  cd {build_directory}
-  sh {source_directory}/configure --host=${HOST} \
-    CFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS} -O3 -fPIE" \
-    CPPFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS}" \
-    LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
-  make
-
-If building for Android 4.0.x (API level < 16) or earlier, remove -fPIE from
-CFLAGS and -pie from LDFLAGS.
-
-
-*******************************************************************************
-**     Building on Windows (Visual C++ or MinGW)
-*******************************************************************************
-
-
-==================
-Build Requirements
-==================
-
--- CMake (http://www.cmake.org) v2.8.11 or later
-
--- Microsoft Visual C++ 2005 or later
-
-   If you don't already have Visual C++, then the easiest way to get it is by
-   installing the Windows SDK:
-
-   http://msdn.microsoft.com/en-us/windows/bb980924.aspx
-
-   The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
-   everything necessary to build libjpeg-turbo.
-
-   * You can also use Microsoft Visual Studio Express Edition, which is a free
-     download.  (NOTE: versions prior to 2012 can only be used to build 32-bit
-     code.)
-   * If you intend to build libjpeg-turbo from the command line, then add the
-     appropriate compiler and SDK directories to the INCLUDE, LIB, and PATH
-     environment variables.  This is generally accomplished by executing
-     vcvars32.bat or vcvars64.bat and SetEnv.cmd.  vcvars32.bat and
-     vcvars64.bat are part of Visual C++ and are located in the same directory
-     as the compiler.  SetEnv.cmd is part of the Windows SDK.  You can pass
-     optional arguments to SetEnv.cmd to specify a 32-bit or 64-bit build
-     environment.
-
-... OR ...
-
--- MinGW
-
-   MinGW-builds (http://sourceforge.net/projects/mingwbuilds/) or
-   tdm-gcc (http://tdm-gcc.tdragon.net/) recommended if building on a Windows
-   machine.  Both distributions install a Start Menu link that can be used to
-   launch a command prompt with the appropriate compiler paths automatically
-   set.
-
--- NASM (http://www.nasm.us) or YASM (http://yasm.tortall.net)
-   * If using NASM, 0.98 or later is required for an x86 build.
-   * If using NASM, 2.05 or later is required for an x86-64 build.
-
--- If building the TurboJPEG Java wrapper, JDK 1.5 or later is required.  This
-   can be downloaded from http://www.java.com.
-
-
-==================
-Out-of-Tree Builds
-==================
-
-Binary objects, libraries, and executables are generated in the same directory
-from which cmake was executed (the "binary directory"), and this directory need
-not necessarily be the same as the libjpeg-turbo source directory.  You can
-create multiple independent binary directories, in which different versions of
-libjpeg-turbo can be built from the same source tree using different compilers
-or settings.  In the sections below, {build_directory} refers to the binary
-directory, whereas {source_directory} refers to the libjpeg-turbo source
-directory.  For in-tree builds, these directories are the same.
-
-
-======================
-Building libjpeg-turbo
-======================
-
-
-Visual C++ (Command Line)
--------------------------
-
-  cd {build_directory}
-  cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release {source_directory}
-  nmake
-
-This will build either a 32-bit or a 64-bit version of libjpeg-turbo, depending
-on which version of cl.exe is in the PATH.
-
-The following files will be generated under {build_directory}:
-
-  jpeg-static.lib
-      Static link library for the libjpeg API
-  sharedlib/jpeg{version}.dll
-      DLL for the libjpeg API
-  sharedlib/jpeg.lib
-      Import library for the libjpeg API
-  turbojpeg-static.lib
-      Static link library for the TurboJPEG API
-  turbojpeg.dll
-      DLL for the TurboJPEG API
-  turbojpeg.lib
-      Import library for the TurboJPEG API
-
-{version} is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
-v8 emulation is enabled.
-
-
-Visual C++ (IDE)
-----------------
-
-Choose the appropriate CMake generator option for your version of Visual Studio
-(run "cmake" with no arguments for a list of available generators.)  For
-instance:
-
-  cd {build_directory}
-  cmake -G "Visual Studio 10" {source_directory}
-
-NOTE:  Add "Win64" to the generator name (for example, "Visual Studio 10
-Win64") to build a 64-bit version of libjpeg-turbo.  Recent versions of CMake
-no longer document that.  A separate build directory must be used for 32-bit
-and 64-bit builds.
-
-You can then open ALL_BUILD.vcproj in Visual Studio and build one of the
-configurations in that project ("Debug", "Release", etc.) to generate a full
-build of libjpeg-turbo.
-
-This will generate the following files under {build_directory}:
-
-  {configuration}/jpeg-static.lib
-      Static link library for the libjpeg API
-  sharedlib/{configuration}/jpeg{version}.dll
-      DLL for the libjpeg API
-  sharedlib/{configuration}/jpeg.lib
-      Import library for the libjpeg API
-  {configuration}/turbojpeg-static.lib
-      Static link library for the TurboJPEG API
-  {configuration}/turbojpeg.dll
-      DLL for the TurboJPEG API
-  {configuration}/turbojpeg.lib
-      Import library for the TurboJPEG API
-
-{configuration} is Debug, Release, RelWithDebInfo, or MinSizeRel, depending on
-the configuration you built in the IDE, and {version} is 62, 7, or 8,
-depending on whether libjpeg v6b (default), v7, or v8 emulation is enabled.
-
-
-MinGW
------
-
-NOTE: This assumes that you are building on a Windows machine.  If you are
-cross-compiling on a Linux/Unix machine, then see "Build Recipes" below.
-
-  cd {build_directory}
-  cmake -G "MinGW Makefiles" {source_directory}
-  mingw32-make
-
-This will generate the following files under {build_directory}
-
-  libjpeg.a
-      Static link library for the libjpeg API
-  sharedlib/libjpeg-{version}.dll
-      DLL for the libjpeg API
-  sharedlib/libjpeg.dll.a
-      Import library for the libjpeg API
-  libturbojpeg.a
-      Static link library for the TurboJPEG API
-  libturbojpeg.dll
-      DLL for the TurboJPEG API
-  libturbojpeg.dll.a
-      Import library for the TurboJPEG API
-
-{version} is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
-v8 emulation is enabled.
-
-
-Debug Build
------------
-
-Add "-DCMAKE_BUILD_TYPE=Debug" to the cmake command line.  Or, if building with
-NMake, remove "-DCMAKE_BUILD_TYPE=Release" (Debug builds are the default with
-NMake.)
-
-
-libjpeg v7 or v8 API/ABI Emulation
------------------------------------
-
-Add "-DWITH_JPEG7=1" to the cmake command line to build a version of
-libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add "-DWITH_JPEG8=1"
-to the cmake command to build a version of libjpeg-turbo that is
-API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
-on libjpeg v7 and v8 emulation.
-
-
-In-Memory Source/Destination Managers
--------------------------------------
-
-When using libjpeg v6b or v7 API/ABI emulation, add -DWITH_MEM_SRCDST=0 to the
-CMake command line to build a version of libjpeg-turbo that lacks the
-jpeg_mem_src() and jpeg_mem_dest() functions.  These functions were not part of
-the original libjpeg v6b and v7 APIs, so removing them ensures strict
-conformance with those APIs.  See README-turbo.txt for more information.
-
-
-Arithmetic Coding Support
--------------------------
-
-Since the patent on arithmetic coding has expired, this functionality has been
-included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
-based on the implementation in libjpeg v8, but it works when emulating libjpeg
-v7 or v6b as well.  The default is to enable both arithmetic encoding and
-decoding, but those who have philosophical objections to arithmetic coding can
-add "-DWITH_ARITH_ENC=0" or "-DWITH_ARITH_DEC=0" to the cmake command line to
-disable encoding or decoding (respectively.)
-
-
-TurboJPEG Java Wrapper
-----------------------
-Add "-DWITH_JAVA=1" to the cmake command line to incorporate an optional Java
-Native Interface wrapper into the TurboJPEG shared library and build the Java
-front-end classes to support it.  This allows the TurboJPEG shared library to
-be used directly from Java applications.  See java/README for more details.
-
-You can set the Java_JAVAC_EXECUTABLE, Java_JAVA_EXECUTABLE, and
-Java_JAR_EXECUTABLE CMake variables to specify alternate commands or locations
-for javac, jar, and java (respectively.)  You can also set the JAVACFLAGS CMake
-variable to specify arguments that should be passed to the Java compiler when
-building the front-end classes.
-
-
-========================
-Installing libjpeg-turbo
-========================
-
-You can use the build system to install libjpeg-turbo into a directory of your
-choosing (as opposed to creating an installer.)  To do this, add:
-
-  -DCMAKE_INSTALL_PREFIX={install_directory}
-
-to the cmake command line.
-
-For example,
-
-  cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=c:\libjpeg-turbo {source_directory}
-  nmake install
-
-will install the header files in c:\libjpeg-turbo\include, the library files
-in c:\libjpeg-turbo\lib, the DLL's in c:\libjpeg-turbo\bin, and the
-documentation in c:\libjpeg-turbo\doc.
-
-
-=============
-Build Recipes
-=============
-
-
-64-bit MinGW Build on Cygwin
-----------------------------
-
-  cd {build_directory}
-  CC=/usr/bin/x86_64-w64-mingw32-gcc \
-    cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
-    -DCMAKE_RC_COMPILER=/usr/bin/x86_64-w64-mingw32-windres.exe \
-    {source_directory}
-  make
-
-This produces a 64-bit build of libjpeg-turbo that does not depend on
-cygwin1.dll or other Cygwin DLL's.  The mingw64-x86_64-gcc-core and
-mingw64-x86_64-gcc-g++ packages (and their dependencies) must be installed.
-
-
-32-bit MinGW Build on Cygwin
-----------------------------
-
-  cd {build_directory}
-  CC=/usr/bin/i686-w64-mingw32-gcc \
-    cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
-    -DCMAKE_RC_COMPILER=/usr/bin/i686-w64-mingw32-windres.exe \
-    {source_directory}
-  make
-
-This produces a 32-bit build of libjpeg-turbo that does not depend on
-cygwin1.dll or other Cygwin DLL's.  The mingw64-i686-gcc-core and
-mingw64-i686-gcc-g++ packages (and their dependencies) must be installed.
-
-
-MinGW Build on Linux
---------------------
-
-  cd {build_directory}
-  CC={mingw_binary_path}/i386-mingw32-gcc \
-    cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
-    -DCMAKE_AR={mingw_binary_path}/i386-mingw32-ar \
-    -DCMAKE_RANLIB={mingw_binary_path}/i386-mingw32-ranlib \
-    {source_directory}
-  make
-
-
-*******************************************************************************
-**     Creating Release Packages
-*******************************************************************************
-
-The following commands can be used to create various types of release packages:
-
-
-Unix/Linux
-----------
-
-make rpm
-
-  Create Red Hat-style binary RPM package.  Requires RPM v4 or later.
-
-make srpm
-
-  This runs 'make dist' to create a pristine source tarball, then creates a
-  Red Hat-style source RPM package from the tarball.  Requires RPM v4 or later.
-
-make deb
-
-  Create Debian-style binary package.  Requires dpkg.
-
-make dmg
-
-  Create Macintosh package/disk image.  This requires pkgbuild and
-  productbuild, which are installed by default on OS X 10.7 and later and which
-  can be obtained by installing Xcode 3.2.6 (with the "Unix Development"
-  option) on OS X 10.6.  Packages built in this manner can be installed on OS X
-  10.5 and later, but they must be built on OS X 10.6 or later.
-
-make udmg [BUILDDIR32={32-bit build directory}]
-
-  On 64-bit OS X systems, this creates a Macintosh package and disk image that
-  contains universal i386/x86-64 binaries.  You should first configure a 32-bit
-  out-of-tree build of libjpeg-turbo, then configure a 64-bit out-of-tree
-  build, then run 'make udmg' from the 64-bit build directory.  The build
-  system will look for the 32-bit build under {source_directory}/osxx86 by
-  default, but you can override this by setting the BUILDDIR32 variable on the
-  make command line as shown above.
-
-make iosdmg [BUILDDIR32={32-bit build directory}] \
-  [BUILDDIRARMV6={ARMv6 build directory}] \
-  [BUILDDIRARMV7={ARMv7 build directory}] \
-  [BUILDDIRARMV7S={ARMv7s build directory}] \
-  [BUILDDIRARMV8={ARMv8 build directory}]
-
-  On OS X systems, this creates a Macintosh package and disk image in which the
-  libjpeg-turbo static libraries contain ARM architectures necessary to build
-  iOS applications.  If building on an x86-64 system, the binaries will also
-  contain the i386 architecture, as with 'make udmg' above.  You should first
-  configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
-  libjpeg-turbo (see "Building libjpeg-turbo for iOS" above.)  If you are
-  building an x86-64 version of libjpeg-turbo, you should configure a 32-bit
-  out-of-tree build as well.  Next, build libjpeg-turbo as you would normally,
-  using an out-of-tree build.  When it is built, run 'make iosdmg' from the
-  build directory.  The build system will look for the ARMv6 build under
-  {source_directory}/iosarmv6 by default, the ARMv7 build under
-  {source_directory}/iosarmv7 by default, the ARMv7s build under
-  {source_directory}/iosarmv7s by default, the ARMv8 build under
-  {source_directory}/iosarmv8 by default, and (if applicable) the 32-bit build
-  under {source_directory}/osxx86 by default, but you can override this by
-  setting the BUILDDIR32, BUILDDIRARMV6, BUILDDIRARMV7, BUILDDIRARMV7S, and/or
-  BUILDDIRARMV8 variables on the make command line as shown above.
-
-  NOTE: If including an ARMv8 build in the package, then you may need to use
-  Xcode's version of lipo instead of the operating system's.  To do this, pass
-  an argument of LIPO="xcrun lipo" on the make command line.
-
-make cygwinpkg
-
-  Build a Cygwin binary package.
-
-
-Windows
--------
-
-If using NMake:
-
-  cd {build_directory}
-  nmake installer
-
-If using MinGW:
-
-  cd {build_directory}
-  make installer
-
-If using the Visual Studio IDE, build the "installer" project.
-
-The installer package (libjpeg-turbo[-gcc][64].exe) will be located under
-{build_directory}.  If building using the Visual Studio IDE, then the installer
-package will be located in a subdirectory with the same name as the
-configuration you built (such as {build_directory}\Debug\ or
-{build_directory}\Release\).
-
-Building a Windows installer requires the Nullsoft Install System
-(http://nsis.sourceforge.net/.)  makensis.exe should be in your PATH.
-
-
-*******************************************************************************
-**     Regression testing
-*******************************************************************************
-
-The most common way to test libjpeg-turbo is by invoking 'make test' on
-Unix/Linux platforms or 'ctest' on Windows platforms, once the build has
-completed.  This runs a series of tests to ensure that mathematical
-compatibility has been maintained between libjpeg-turbo and libjpeg v6b.  This
-also invokes the TurboJPEG unit tests, which ensure that the colorspace
-extensions, YUV encoding, decompression scaling, and other features of the
-TurboJPEG C and Java APIs are working properly (and, by extension, that the
-equivalent features of the underlying libjpeg API are also working.)
-
-Invoking 'make testclean' or 'nmake testclean' (if using NMake) or building
-the 'testclean' target (if using the Visual Studio IDE) will clean up the
-output images generated by 'make test'.
-
-On Unix/Linux platforms, more extensive tests of the TurboJPEG C and Java
-wrappers can be run by invoking 'make tjtest'.  These extended TurboJPEG tests
-essentially iterate through all of the available features of the TurboJPEG APIs
-that are not covered by the TurboJPEG unit tests (this includes the lossless
-transform options) and compare the images generated by each feature to images
-generated using the equivalent feature in the libjpeg API.  The extended
-TurboJPEG tests are meant to test for regressions in the TurboJPEG wrappers,
-not in the underlying libjpeg API library.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e83798..5924f0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,25 @@
 endif()
 
 project(libjpeg-turbo C)
-set(VERSION 1.4.3)
+set(VERSION 1.4.90)
+string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
+list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
+list(GET VERSION_TRIPLET 1 VERSION_MINOR)
+list(GET VERSION_TRIPLET 2 VERSION_REVISION)
+function(pad_number NUMBER OUTPUT_LEN)  # left-pad the variable whose name is passed in NUMBER with "0" up to OUTPUT_LEN characters
+  string(LENGTH "${${NUMBER}}" INPUT_LEN)  # NUMBER holds a variable name, hence the double dereference
+  if(INPUT_LEN LESS OUTPUT_LEN)
+    math(EXPR ZEROES "${OUTPUT_LEN} - ${INPUT_LEN} - 1")  # "- 1" because foreach(RANGE n) iterates 0..n inclusive
+    set(NUM ${${NUMBER}})
+    foreach(C RANGE ${ZEROES})
+      set(NUM "0${NUM}")
+    endforeach()
+    set(${NUMBER} ${NUM} PARENT_SCOPE)  # write the padded value back into the caller's variable
+  endif()
+endfunction()
+pad_number(VERSION_MINOR 3)
+pad_number(VERSION_REVISION 3)
+set(LIBJPEG_TURBO_VERSION_NUMBER ${VERSION_MAJOR}${VERSION_MINOR}${VERSION_REVISION})
 
 if(NOT WIN32)
   message(FATAL_ERROR "Platform not supported by this build system.  Use autotools instead.")
@@ -34,8 +52,8 @@
 message(STATUS "VERSION = ${VERSION}, BUILD = ${BUILD}")
 
 option(WITH_SIMD "Include SIMD extensions" TRUE)
-option(WITH_ARITH_ENC "Include arithmetic encoding support" TRUE)
-option(WITH_ARITH_DEC "Include arithmetic decoding support" TRUE)
+option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
+option(WITH_ARITH_DEC "Include arithmetic decoding support when emulating the libjpeg v6b API/ABI" TRUE)
 option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
 option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
 option(WITH_MEM_SRCDST "Include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI" TRUE)
@@ -57,6 +75,14 @@
   set(BITS_IN_JSAMPLE 8)
 endif()
 
+if(WITH_JPEG8 OR WITH_JPEG7)  # arithmetic coding is forced on when emulating libjpeg v7 or v8
+  set(WITH_ARITH_ENC 1)
+  set(WITH_ARITH_DEC 1)
+endif()
+if(WITH_JPEG8)  # the in-memory source/destination managers are forced on when emulating libjpeg v8
+  set(WITH_MEM_SRCDST 1)
+endif()
+
 if(WITH_ARITH_ENC)
   set(C_ARITH_CODING_SUPPORTED 1)
   message(STATUS "Arithmetic encoding support enabled")
@@ -162,6 +188,14 @@
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR})
 
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC)  # quoted so an empty build type still yields an (empty) argument
+
+set(EFFECTIVE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UC}}")  # global flags + flags of the selected build type
+message(STATUS "Compiler flags = ${EFFECTIVE_C_FLAGS}")
+
+set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE_UC}}")  # same, for executable linker flags
+message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
+
 if(WITH_JAVA)
   find_package(Java)
   find_package(JNI)
@@ -309,6 +343,8 @@
 # Tests
 #
 
+add_subdirectory(md5)
+
 if(MSVC_IDE)
   set(OBJDIR "\${CTEST_CONFIGURATION_TYPE}/")
 else()
@@ -345,6 +381,10 @@
   set(MD5_PPM_420M_ISLOW_3_8 343d19015531b7bbe746124127244fa8)
   set(MD5_PPM_420M_ISLOW_1_4 35fd59d866e44659edfa3c18db2a3edb)
   set(MD5_PPM_420M_ISLOW_1_8 ccaed48ac0aedefda5d4abe4013f4ad7)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 86664cd9dc956536409e44e244d20a97)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 452a21656115a163029cfba5c04fa76a)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 ef63901f71ef7a75cd78253fc0914f84)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 15b173fb5872d9575572fbcc1b05956f)
   set(MD5_JPEG_CROP cdb35ff4b4519392690ea040c56ea99c)
 else()
   set(TESTORIG testorig.jpg)
@@ -394,6 +434,13 @@
   set(MD5_BMP_420_ISLOW_565D 6bde71526acc44bcff76f696df8638d2)
   set(MD5_BMP_420M_ISLOW_565 8dc0185245353cfa32ad97027342216f)
   set(MD5_BMP_420M_ISLOW_565D d1be3a3339166255e76fa50a0d70d73e)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 c4c65c1e43d7275cd50328a61e6534f0)
+  set(MD5_PPM_420_ISLOW_ARI_SKIP16_139 087c6b123db16ac00cb88c5b590bb74a)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 26eb36ccc7d1f0cb80cdabb0ac8b5d99)
+  set(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 886c6775af22370257122f8b16207e6d)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 5606f86874cf26b8fcee1117a0a436a6)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 db87dc7ce26bcdc7a6b56239ce2b9d6c)
+  set(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 cb57b32bd6d03e35432362f7bf184b6d)
   set(MD5_JPEG_CROP b4197f377e621c4e9b1d20471432610d)
 endif()
 
@@ -432,6 +479,13 @@
   set(TEST_LIBTYPES ${TEST_LIBTYPES} static)
 endif()
 
+set(TESTIMAGES ${CMAKE_SOURCE_DIR}/testimages)  # directory containing the test input images
+set(MD5CMP ${CMAKE_CURRENT_BINARY_DIR}/md5/md5cmp)  # checksum comparison tool; presumably built by the md5 subdirectory (confirm)
+if(CMAKE_CROSSCOMPILING)  # when cross-compiling, express both paths relative to the build directory
+  file(RELATIVE_PATH TESTIMAGES ${CMAKE_CURRENT_BINARY_DIR} ${TESTIMAGES})
+  file(RELATIVE_PATH MD5CMP ${CMAKE_CURRENT_BINARY_DIR} ${MD5CMP})
+endif()
+
 foreach(libtype ${TEST_LIBTYPES})
   if(libtype STREQUAL "shared")
     set(dir sharedlib/)
@@ -453,219 +507,186 @@
 
   # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
   add_test(cjpeg${suffix}-rgb-islow
-    ${dir}cjpeg${suffix} -rgb -dct int -outfile testout_rgb_islow.jpg
-      ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+    ${dir}cjpeg${suffix} -rgb -dct int
+      -outfile testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-rgb-islow-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_RGB_ISLOW} -DFILE=testout_rgb_islow.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_RGB_ISLOW} testout_rgb_islow.jpg)
+
   # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
   add_test(djpeg${suffix}-rgb-islow
-    ${dir}djpeg${suffix} -dct int -ppm -outfile testout_rgb_islow.ppm
-      testout_rgb_islow.jpg)
+    ${dir}djpeg${suffix} -dct int -ppm
+      -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg)
   add_test(djpeg${suffix}-rgb-islow-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_RGB_ISLOW} -DFILE=testout_rgb_islow.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_RGB_ISLOW} testout_rgb_islow.ppm)
+
   if(NOT WITH_12BIT)
     # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-rgb-islow-565
       ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
         -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg)
     add_test(djpeg${suffix}-rgb-islow-565-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_RGB_ISLOW_565}
-        -DFILE=testout_rgb_islow_565.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565} testout_rgb_islow_565.bmp)
+
     # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-rgb-islow-565D
       ${dir}djpeg${suffix} -dct int -rgb565 -bmp
         -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg)
     add_test(djpeg${suffix}-rgb-islow-565D-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_RGB_ISLOW_565D}
-        -DFILE=testout_rgb_islow_565D.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565D} testout_rgb_islow_565D.bmp)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
   add_test(cjpeg${suffix}-422-ifast-opt
     ${dir}cjpeg${suffix} -sample 2x1 -dct fast -opt
-      -outfile testout_422_ifast_opt.jpg
-      ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+      -outfile testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-422-ifast-opt-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_422_IFAST_OPT}
-      -DFILE=testout_422_ifast_opt.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_422_IFAST_OPT} testout_422_ifast_opt.jpg)
+
   # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
   add_test(djpeg${suffix}-422-ifast
-    ${dir}djpeg${suffix} -dct fast -outfile testout_422_ifast.ppm
-      testout_422_ifast_opt.jpg)
+    ${dir}djpeg${suffix} -dct fast
+      -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg)
   add_test(djpeg${suffix}-422-ifast-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_422_IFAST} -DFILE=testout_422_ifast.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_422_IFAST} testout_422_ifast.ppm)
+
   # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
   add_test(djpeg${suffix}-422m-ifast
-    ${dir}djpeg${suffix} -dct fast -nosmooth -outfile testout_422m_ifast.ppm
-      testout_422_ifast_opt.jpg)
+    ${dir}djpeg${suffix} -dct fast -nosmooth
+      -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg)
   add_test(djpeg${suffix}-422m-ifast-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_422M_IFAST} -DFILE=testout_422m_ifast.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_422M_IFAST} testout_422m_ifast.ppm)
+
   if(NOT WITH_12BIT)
     # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
     add_test(djpeg${suffix}-422m-ifast-565
       ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
         -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg)
     add_test(djpeg${suffix}-422m-ifast-565-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_422M_IFAST_565}
-        -DFILE=testout_422m_ifast_565.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_422M_IFAST_565} testout_422m_ifast_565.bmp)
+
     # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
     add_test(djpeg${suffix}-422m-ifast-565D
       ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
         -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg)
     add_test(djpeg${suffix}-422m-ifast-565D-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_422M_IFAST_565D}
-        -DFILE=testout_422m_ifast_565D.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_422M_IFAST_565D} testout_422m_ifast_565D.bmp)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
   add_test(cjpeg${suffix}-420-q100-ifast-prog
     ${dir}cjpeg${suffix} -sample 2x2 -quality 100 -dct fast -prog
-      -outfile testout_420_q100_ifast_prog.jpg
-      ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+      -outfile testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-420-q100-ifast-prog-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420_IFAST_Q100_PROG}
-      -DFILE=testout_420_q100_ifast_prog.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_420_IFAST_Q100_PROG} testout_420_q100_ifast_prog.jpg)
+
   # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
   add_test(djpeg${suffix}-420-q100-ifast-prog
-    ${dir}djpeg${suffix} -dct fast -outfile testout_420_q100_ifast.ppm
-      testout_420_q100_ifast_prog.jpg)
+    ${dir}djpeg${suffix} -dct fast
+      -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
   add_test(djpeg${suffix}-420-q100-ifast-prog-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_420_Q100_IFAST}
-      -DFILE=testout_420_q100_ifast.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_420_Q100_IFAST} testout_420_q100_ifast.ppm)
+
   # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
   add_test(djpeg${suffix}-420m-q100-ifast-prog
     ${dir}djpeg${suffix} -dct fast -nosmooth
       -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
   add_test(djpeg${suffix}-420m-q100-ifast-prog-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_Q100_IFAST}
-      -DFILE=testout_420m_q100_ifast.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_420M_Q100_IFAST} testout_420m_q100_ifast.ppm)
 
   # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
   add_test(cjpeg${suffix}-gray-islow
-    ${dir}cjpeg${suffix} -gray -dct int -outfile testout_gray_islow.jpg
-      ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+    ${dir}cjpeg${suffix} -gray -dct int
+      -outfile testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-gray-islow-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_GRAY_ISLOW}
-      -DFILE=testout_gray_islow.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_GRAY_ISLOW} testout_gray_islow.jpg)
+
   # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
   add_test(djpeg${suffix}-gray-islow
-    ${dir}djpeg${suffix} -dct int -outfile testout_gray_islow.ppm
-      testout_gray_islow.jpg)
+    ${dir}djpeg${suffix} -dct int
+      -outfile testout_gray_islow.ppm testout_gray_islow.jpg)
   add_test(djpeg${suffix}-gray-islow-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW}
-      -DFILE=testout_gray_islow.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW} testout_gray_islow.ppm)
+
   # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
   add_test(djpeg${suffix}-gray-islow-rgb
-    ${dir}djpeg${suffix} -dct int -rgb -outfile testout_gray_islow_rgb.ppm
-      testout_gray_islow.jpg)
+    ${dir}djpeg${suffix} -dct int -rgb
+      -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg)
   add_test(djpeg${suffix}-gray-islow-rgb-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_GRAY_ISLOW_RGB}
-      -DFILE=testout_gray_islow_rgb.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW_RGB} testout_gray_islow_rgb.ppm)
+
   if(NOT WITH_12BIT)
     # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-gray-islow-565
       ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
         -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg)
     add_test(djpeg${suffix}-gray-islow-565-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_GRAY_ISLOW_565}
-        -DFILE=testout_gray_islow_565.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565} testout_gray_islow_565.bmp)
+
     # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-gray-islow-565D
       ${dir}djpeg${suffix} -dct int -rgb565 -bmp
         -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg)
     add_test(djpeg${suffix}-gray-islow-565D-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_GRAY_ISLOW_565D}
-        -DFILE=testout_gray_islow_565D.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565D} testout_gray_islow_565D.bmp)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
   # ENT: 2-pass huff
   add_test(cjpeg${suffix}-420s-ifast-opt
-    ${dir}cjpeg${suffix} -sample 2x2 -smooth 1 -dct int -opt -outfile
-      testout_420s_ifast_opt.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+    ${dir}cjpeg${suffix} -sample 2x2 -smooth 1 -dct int -opt
+      -outfile testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-420s-ifast-opt-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420S_IFAST_OPT}
-      -DFILE=testout_420s_ifast_opt.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_420S_IFAST_OPT} testout_420s_ifast_opt.jpg)
 
   # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
   add_test(cjpeg${suffix}-3x2-float-prog
     ${dir}cjpeg${suffix} -sample 3x2 -dct float -prog
-      -outfile testout_3x2_float_prog.jpg
-      ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+      -outfile testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm)
   add_test(cjpeg${suffix}-3x2-float-prog-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_3x2_FLOAT_PROG}
-      -DFILE=testout_3x2_float_prog.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_3x2_FLOAT_PROG} testout_3x2_float_prog.jpg)
+
   # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
   add_test(djpeg${suffix}-3x2-float-prog
-    ${dir}djpeg${suffix} -dct float -outfile testout_3x2_float.ppm
-      testout_3x2_float_prog.jpg)
+    ${dir}djpeg${suffix} -dct float
+      -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg)
   add_test(djpeg${suffix}-3x2-float-prog-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_PPM_3x2_FLOAT} -DFILE=testout_3x2_float.ppm
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_PPM_3x2_FLOAT} testout_3x2_float.ppm)
 
   if(WITH_ARITH_ENC)
     # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
     add_test(cjpeg${suffix}-420-islow-ari
       ${dir}cjpeg${suffix} -dct int -arithmetic
-        -outfile testout_420_islow_ari.jpg
-        ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
     add_test(cjpeg${suffix}-420-islow-ari-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420_ISLOW_ARI}
-        -DFILE=testout_420_islow_ari.jpg
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+
     add_test(jpegtran${suffix}-420-islow-ari
       ${dir}jpegtran${suffix} -arithmetic
-        -outfile testout_420_islow_ari.jpg
-        ${CMAKE_SOURCE_DIR}/testimages/testimgint.jpg)
+        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testimgint.jpg)
     add_test(jpegtran${suffix}-420-islow-ari-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420_ISLOW_ARI}
-        -DFILE=testout_420_islow_ari.jpg
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+
     # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
     add_test(cjpeg${suffix}-444-islow-progari
-      ${dir}cjpeg${suffix} -sample 1x1 -dct int -progressive -arithmetic
-        -outfile testout_444_islow_progari.jpg
-        ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
+      ${dir}cjpeg${suffix} -sample 1x1 -dct int -prog -arithmetic
+        -outfile testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm)
     add_test(cjpeg${suffix}-444-islow-progari-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_444_ISLOW_PROGARI}
-        -DFILE=testout_444_islow_progari.jpg
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_JPEG_444_ISLOW_PROGARI} testout_444_islow_progari.jpg)
   endif()
+
   if(WITH_ARITH_DEC)
     # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
     add_test(djpeg${suffix}-420m-ifast-ari
-      ${dir}djpeg${suffix} -fast -ppm -outfile testout_420m_ifast_ari.ppm
-        ${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
+      ${dir}djpeg${suffix} -fast -ppm
+        -outfile testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg)
     add_test(djpeg${suffix}-420m-ifast-ari-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_IFAST_ARI}
-        -DFILE=testout_420m_ifast_ari.ppm
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_PPM_420M_IFAST_ARI} testout_420m_ifast_ari.ppm)
+
     add_test(jpegtran${suffix}-420-islow
-      ${dir}jpegtran${suffix} -outfile testout_420_islow.jpg
-        ${CMAKE_SOURCE_DIR}/testimages/testimgari.jpg)
+      ${dir}jpegtran${suffix}
+        -outfile testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg)
     add_test(jpegtran${suffix}-420-islow-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_420_ISLOW}
-        -DFILE=testout_420_islow.jpg
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW} testout_420_islow.jpg)
   endif()
 
   # 2/1--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
@@ -691,71 +712,139 @@
     string(REGEX REPLACE "_" "/" scalearg ${scale})
     add_test(djpeg${suffix}-420m-islow-${scale}
       ${dir}djpeg${suffix} -dct int -scale ${scalearg} -nosmooth -ppm
-        -outfile testout_420m_islow_${scale}.ppm
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420m-islow-${scale}-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_PPM_420M_ISLOW_${scale}}
-        -DFILE=testout_420m_islow_${scale}.ppm
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_PPM_420M_ISLOW_${scale}} testout_420m_islow_${scale}.ppm)
   endforeach()
 
   if(NOT WITH_12BIT)
     # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-420-islow-256
       ${dir}djpeg${suffix} -dct int -colors 256 -bmp
-        -outfile testout_420_islow_256.bmp
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420-islow-256-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_420_ISLOW_256}
-        -DFILE=testout_420_islow_256.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_256} testout_420_islow_256.bmp)
+
     # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-420-islow-565
       ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
-        -outfile testout_420_islow_565.bmp
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420-islow-565-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_420_ISLOW_565}
-        -DFILE=testout_420_islow_565.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_565} testout_420_islow_565.bmp)
+
     # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-420-islow-565D
       ${dir}djpeg${suffix} -dct int -rgb565 -bmp
-        -outfile testout_420_islow_565D.bmp
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420-islow-565D-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_420_ISLOW_565D}
-        -DFILE=testout_420_islow_565D.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_565D} testout_420_islow_565D.bmp)
+
     # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-420m-islow-565
       ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
-        -outfile testout_420m_islow_565.bmp
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420m-islow-565-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_420M_ISLOW_565}
-        -DFILE=testout_420m_islow_565.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565} testout_420m_islow_565.bmp)
+
     # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
     add_test(djpeg${suffix}-420m-islow-565D
       ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
-        -outfile testout_420m_islow_565D.bmp
-        ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+        -outfile testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
     add_test(djpeg${suffix}-420m-islow-565D-cmp
-      ${CMAKE_COMMAND} -DMD5=${MD5_BMP_420M_ISLOW_565D}
-        -DFILE=testout_420m_islow_565D.bmp
-        -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565D} testout_420m_islow_565D.bmp)
   endif()
+
+  # Partial decode tests.  These tests are designed to cover all of the
+  # possible code paths in jpeg_skip_scanlines().
+
+  # Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
+  add_test(djpeg${suffix}-420-islow-skip15_31
+    ${dir}djpeg${suffix} -dct int -skip 15,31 -ppm
+      -outfile testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG})
+  add_test(djpeg${suffix}-420-islow-skip15_31-cmp
+    ${MD5CMP} ${MD5_PPM_420_ISLOW_SKIP15_31} testout_420_islow_skip15,31.ppm)
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
+  if(WITH_ARITH_DEC)
+    add_test(djpeg${suffix}-420-islow-ari-skip16_139
+      ${dir}djpeg${suffix} -dct int -skip 16,139 -ppm
+        -outfile testout_420_islow_ari_skip16,139.ppm
+        ${TESTIMAGES}/testimgari.jpg)
+    add_test(djpeg${suffix}-420-islow-ari-skip16_139-cmp  # named <producer test>-cmp, like every other compare test
+      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_SKIP16_139}
+        testout_420_islow_ari_skip16,139.ppm)
+  endif()
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
+  add_test(cjpeg${suffix}-420-islow-prog
+    ${dir}cjpeg${suffix} -dct int -prog
+      -outfile testout_420_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71
+    ${dir}djpeg${suffix} -dct int -crop 62x62+71+71 -ppm
+      -outfile testout_420_islow_prog_crop62x62,71,71.ppm
+      testout_420_islow_prog.jpg)
+  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71-cmp
+    ${MD5CMP} ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71}
+      testout_420_islow_prog_crop62x62,71,71.ppm)
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
+  if(WITH_ARITH_DEC)
+    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4
+      ${dir}djpeg${suffix} -dct int -crop 53x53+4+4 -ppm
+        -outfile testout_420_islow_ari_crop53x53,4,4.ppm
+        ${TESTIMAGES}/testimgari.jpg)
+    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4-cmp
+      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4}
+        testout_420_islow_ari_crop53x53,4,4.ppm)
+  endif()
+
+  # Context rows: No   Intra-iMCU row: Yes  ENT: huff
+  add_test(cjpeg${suffix}-444-islow
+    ${dir}cjpeg${suffix} -dct int -sample 1x1
+      -outfile testout_444_islow.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-444-islow-skip1_6
+    ${dir}djpeg${suffix} -dct int -skip 1,6 -ppm
+      -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg)
+  add_test(djpeg${suffix}-444-islow-skip1_6-cmp
+    ${MD5CMP} ${MD5_PPM_444_ISLOW_SKIP1_6} testout_444_islow_skip1,6.ppm)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: prog huff
+  add_test(cjpeg${suffix}-444-islow-prog
+    ${dir}cjpeg${suffix} -dct int -prog -sample 1x1
+      -outfile testout_444_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-444-islow-prog-crop98x98_13_13
+    ${dir}djpeg${suffix} -dct int -crop 98x98+13+13 -ppm
+      -outfile testout_444_islow_prog_crop98x98,13,13.ppm
+      testout_444_islow_prog.jpg)
+  add_test(djpeg${suffix}-444-islow-prog-crop98x98_13_13-cmp  # named <producer test>-cmp, like every other compare test
+    ${MD5CMP} ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13}
+      testout_444_islow_prog_crop98x98,13,13.ppm)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: arith
+  if(WITH_ARITH_ENC)
+    add_test(cjpeg${suffix}-444-islow-ari
+      ${dir}cjpeg${suffix} -dct int -arithmetic -sample 1x1
+        -outfile testout_444_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
+    if(WITH_ARITH_DEC)
+      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0
+        ${dir}djpeg${suffix} -dct int -crop 37x37+0+0 -ppm
+          -outfile testout_444_islow_ari_crop37x37,0,0.ppm
+          testout_444_islow_ari.jpg)
+      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0-cmp
+        ${MD5CMP} ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0}
+          testout_444_islow_ari_crop37x37,0,0.ppm)
+    endif()
+  endif()
+
   add_test(jpegtran${suffix}-crop
     ${dir}jpegtran${suffix} -crop 120x90+20+50 -transpose -perfect
-      -outfile testout_crop.jpg ${CMAKE_SOURCE_DIR}/testimages/${TESTORIG})
+      -outfile testout_crop.jpg ${TESTIMAGES}/${TESTORIG})
   add_test(jpegtran${suffix}-crop-cmp
-    ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_CROP} -DFILE=testout_crop.jpg
-      -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
+    ${MD5CMP} ${MD5_JPEG_CROP} testout_crop.jpg)
 
 endforeach()
 
-add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P
+add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P  # run the cleanup script in CMake script mode
   ${CMAKE_SOURCE_DIR}/cmakescripts/testclean.cmake)
 
 
@@ -835,7 +924,7 @@
 
 install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION bin)
 
-install(FILES ${CMAKE_SOURCE_DIR}/README ${CMAKE_SOURCE_DIR}/README-turbo.txt
+install(FILES ${CMAKE_SOURCE_DIR}/README.ijg ${CMAKE_SOURCE_DIR}/README.md
   ${CMAKE_SOURCE_DIR}/example.c ${CMAKE_SOURCE_DIR}/libjpeg.txt
   ${CMAKE_SOURCE_DIR}/structure.txt ${CMAKE_SOURCE_DIR}/usage.txt
   ${CMAKE_SOURCE_DIR}/wizard.txt
diff --git a/ChangeLog.md b/ChangeLog.md
new file mode 100644
index 0000000..0691ccc
--- /dev/null
+++ b/ChangeLog.md
@@ -0,0 +1,945 @@
+1.5.0
+=====
+
+### Significant changes relative to 1.5 beta1:
+
+1. Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
+path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
+
+2. Added libjpeg-turbo version and build information to the global string table
+of the libjpeg and TurboJPEG API libraries.  This is a common practice in other
+infrastructure libraries, such as OpenSSL and libpng, because it makes it easy
+to examine an application binary and determine which version of the library the
+application was linked against.
+
+3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
+in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
+maximum value defined in the file's header.  libjpeg-turbo 1.4.2 already
+included a similar fix for ASCII PPM/PGM files.  Note that these issues were
+not security bugs, since they were confined to the cjpeg program and did not
+affect any of the libjpeg-turbo libraries.
+
+4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
+header using the `tjDecompressToYUV2()` function would cause the function to
+abort without returning an error and, under certain circumstances, corrupt the
+stack.  This only occurred if `tjDecompressToYUV2()` was called prior to
+calling `tjDecompressHeader3()`, or if the return value from
+`tjDecompressHeader3()` was ignored (both cases represent incorrect usage of
+the TurboJPEG API.)
+
+5. Fixed an issue in the ARM 32-bit SIMD-accelerated Huffman encoder that
+prevented the code from assembling properly with clang.
+
+6. The `jpeg_stdio_src()`, `jpeg_mem_src()`, `jpeg_stdio_dest()`, and
+`jpeg_mem_dest()` functions in the libjpeg API will now throw an error if a
+source/destination manager has already been assigned to the compress or
+decompress object by a different function or by the calling program.  This
+prevents these functions from attempting to reuse a source/destination manager
+structure that was allocated elsewhere, because there is no way to ensure that
+it would be big enough to accommodate the new source/destination manager.
+
+
+1.4.90 (1.5 beta1)
+==================
+
+### Significant changes relative to 1.4.2:
+
+1. Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions.  Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+2. Added two new libjpeg API functions (`jpeg_skip_scanlines()` and
+`jpeg_crop_scanline()`) that can be used to partially decode a JPEG image.  See
+[libjpeg.txt](libjpeg.txt) for more details.
+
+3. The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+4. The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+5. Source buffers for the TurboJPEG C API functions, as well as the
+`jpeg_mem_src()` function in the libjpeg API, are now declared as const
+pointers.  This facilitates passing read-only buffers to those functions and
+assures the caller that the source buffer will not be modified.  This should
+not create any backward API or ABI incompatibilities with prior libjpeg-turbo
+releases.
+
+6. The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+7. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers.  Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+8. Removed the unnecessary `.arch` directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+9. Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions.  This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time.  Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+10. The x86-64 SIMD code can now be disabled at run time by setting the
+`JSIMD_FORCENONE` environment variable to `1` (the other SIMD implementations
+already had this capability.)
+
+11. Added a new command-line argument to TJBench (`-nowrite`) that prevents the
+benchmark from outputting any images.  This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+12. Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and x86-64
+platforms.  This speeds up the compression of full-color JPEGs by about 10-15%
+on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and AMD
+CPUs.  Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(<https://llvm.org/bugs/show_bug.cgi?id=16035>).  For the purposes of
+benchmarking or regression testing, SIMD-accelerated Huffman encoding can be
+disabled by setting the `JSIMD_NOHUFFENC` environment variable to `1`.
+
+13. Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the slow integer forward DCT and h2v2 & h2v1
+downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.)  This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+14. Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+    For 32-bit code, this speeds up the compression of full-color JPEGs by
+about 30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by
+about 6-7% on average on a typical Android device (Nexus 5X, Cortex-A53 and
+Cortex-A57), relative to libjpeg-turbo 1.4.x.  Note that the larger speedup
+under iOS is due to the fact that iOS builds use LLVM, which does not optimize
+the C Huffman encoder as well as GCC does.
+
+    For 64-bit code, NEON-accelerated Huffman encoding speeds up the
+compression of full-color JPEGs by about 40% on average on a typical iOS device
+(iPhone 5S, Apple A7) and by about 7-8% on average on a typical Android device
+(Nexus 5X, Cortex-A53 and Cortex-A57), in addition to the speedup described in
+[13] above.
+
+    For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the `JSIMD_NOHUFFENC` environment
+variable to `1`.
+
+15. pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems.  Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+16. Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines.  This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+17. Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed.  In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer.  However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+18. Fixed a memory leak in tjunittest encountered when running the program
+with the `-yuv` option.
+
+
+1.4.2
+=====
+
+### Significant changes relative to 1.4.1:
+
+1. Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+2. Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used.  This was known to cause `make test` to fail when the
+library was built with `-march=haswell` on x86 systems.
+
+3. Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler.  This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines.  Those routines were incorrectly using a 64-bit `mov` instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels.  The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+5. Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers.  None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+### Significant changes relative to 1.4.0:
+
+1. tjbench now properly handles CMYK/YCCK JPEG files.  Passing an argument of
+`-cmyk` (instead of, for instance, `-rgb`) will cause tjbench to internally
+convert the source bitmap to CMYK prior to compression, to generate YCCK JPEG
+files, and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.)  The CMYK<->RGB conversion operation is
+not benchmarked.  NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only.  Proper conversion between CMYK and RGB
+requires a color management system.
+
+2. `make test` now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+3. `make test` no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in [Makefile.am](Makefile.am) for information on how to
+re-enable the tests and to specify an expected result for them based on the
+particulars of your platform.
+
+4. The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+5. Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the `-nosmooth` option
+on a MIPS machine that lacked DSPr2 support.  The MIPS SIMD routines for h2v1
+and h2v2 merged upsampling were not properly checking for the existence of
+DSPr2.
+
+6. Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.)  Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux.  Oops.
+
+7. Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the slow integer forward DCT were
+used.
+
+8. The Windows (CMake) build system now supports building only static or only
+shared libraries.  This is accomplished by adding either `-DENABLE_STATIC=0` or
+`-DENABLE_SHARED=0` to the CMake command line.
+
+9. TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API.  For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+10. Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+### Significant changes relative to 1.4 beta1:
+
+1. Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the `le32toh()` and `htole32()` functions.)
+
+2. The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines.  This has been fixed.
+
+3. Fixed an issue in `tjPlaneSizeYUV()` whereby it would erroneously return 1
+instead of -1 if `componentID` was > 0 and `subsamp` was `TJSAMP_GRAY`.
+
+4. Fixed an issue in `tjBufSizeYUV2()` whereby it would erroneously return 0
+instead of -1 if `width` was < 1.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM64 platforms (see 1.4 beta1[5].)
+
+6. The `close()` method in the TJCompressor and TJDecompressor Java classes is
+now idempotent.  Previously, that method would call the native `tjDestroy()`
+function even if the TurboJPEG instance had already been destroyed.  This
+caused an exception to be thrown during finalization, if the `close()` method
+had already been called.  The exception was caught, but it was still an
+expensive operation.
+
+7. The TurboJPEG API previously generated an error (`Could not determine
+subsampling type for JPEG image`) when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).  Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor.  However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+8. cjpeg, djpeg, and jpegtran now accept an argument of `-version`, which will
+print the library version and exit.
+
+9. Referring to 1.4 beta1[15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun.  Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+10. The new `tjPlaneSizeYUV()`, `tjPlaneWidth()`, and `tjPlaneHeight()`
+functions were not actually usable on any platform except OS X and Windows,
+because those functions were not included in the libturbojpeg mapfile.  This
+has been fixed.
+
+11. Restored the `JPP()`, `JMETHOD()`, and `FAR` macros in the libjpeg-turbo
+header files.  The `JPP()` and `JMETHOD()` macros were originally implemented
+in libjpeg as a way of supporting non-ANSI compilers that lacked support for
+prototype parameters.  libjpeg-turbo has never supported such compilers, but
+some software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the `FAR` macro.  A
+pretty good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+12. Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+### Significant changes relative to 1.3.1:
+
+1. New features in the TurboJPEG API:
+
+     - YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+     - The decompress-to-YUV function has been extended to support image
+scaling.
+     - JPEG images can now be compressed from YUV planar source images.
+     - YUV planar images can now be decoded into RGB or grayscale images.
+     - 4:1:1 subsampling is now supported.  This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+     - CMYK images are now supported.  This feature allows CMYK source images
+to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to
+CMYK destination images.  Conversion between CMYK/YCCK and RGB or YUV images is
+not supported.  Such conversion requires a color management system and is thus
+out of scope for a codec library.
+     - The handling of YUV images in the Java API has been significantly
+refactored and should now be much more intuitive.
+     - The Java API now supports encoding a YUV image from an arbitrary
+position in a large image buffer.
+     - All of the YUV functions now have a corresponding function that operates
+on separate image planes instead of a unified image buffer.  This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image.  It also allows for handling YUV formats that swap the order of the
+U and V planes.
+
+2. Added SIMD acceleration for DSPr2-capable MIPS platforms.  This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+3. If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables.  In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application.  An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+4. The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.)  This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later.  OS X 10.4 "Tiger" is no longer supported.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM platforms rather than a lookup table.  This reduces the memory footprint
+by 64k, which may be important for some mobile applications.  Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature.  Actual mileage may vary.
+
+6. Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built.  This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+7. Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation.  Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature.  They generally do not produce significantly better
+accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit
+slower.
+
+8. Added a new output colorspace (`JCS_RGB565`) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels.  If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
+
+9. Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+10. Fixed a segfault that occurred when calling `output_message()` with
+`msg_code` set to `JMSG_COPYRIGHT`.
+
+11. Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+12. Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+13. Restored 12-bit-per-component JPEG support.  A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of `--with-12bit` to
+configure (Unix) or `-DWITH_12BIT=1` to cmake (Windows.)  12-bit JPEG support
+is included only for convenience.  Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API.  The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+14. Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.)  For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+15. Fixed an extremely rare bug that could cause the Huffman encoder's local
+buffer to overrun when a very high-frequency MCU is compressed using quality
+100 and no subsampling, and when the JPEG output buffer is being dynamically
+resized by the destination manager.  This issue was so rare that, even with a
+test program specifically designed to make the bug occur (by injecting random
+high-frequency YUV data into the compressor), it was reproducible only once in
+about every 25 million iterations.
+
+16. Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the `jpegSize` parameter was equal to the size of the buffer, when
+in fact that parameter was probably equal to the size of the most recently
+compressed JPEG image.  If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
+1.3.1
+=====
+
+### Significant changes relative to 1.3.0:
+
+1. On Un*x systems, `make install` now installs the libjpeg-turbo libraries
+into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
+and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
+x86-64.  You can override this by overriding either the `prefix` or `libdir`
+configure variables.
+
+2. The Windows installer now places a copy of the TurboJPEG DLLs in the same
+directory as the rest of the libjpeg-turbo binaries.  This was mainly done
+to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
+When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
+access the c:\WINDOWS\system32 directory, which made it impossible for the
+TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
+
+3. Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
+entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
+jpegtran, for instance) would result in an error, `Requested feature was
+omitted at compile time`.
+
+4. Fixed a couple of issues whereby malformed JPEG images would cause
+libjpeg-turbo to use uninitialized memory during decompression.
+
+5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
+when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
+source image, and added a unit test to check for this error.
+
+6. The Java classes should now build properly under Visual Studio 2010 and
+later.
+
+7. Fixed an issue that prevented SRPMs generated using the in-tree packaging
+tools from being rebuilt on certain newer Linux distributions.
+
+8. Numerous minor fixes to eliminate compilation and build/packaging system
+warnings, fix cosmetic issues, improve documentation clarity, and other general
+source cleanup.
+
+
+1.3.0
+=====
+
+### Significant changes relative to 1.3 beta1:
+
+1. `make test` now works properly on FreeBSD, and it no longer requires the
+md5sum executable to be present on other Un*x platforms.
+
+2. Overhauled the packaging system:
+
+     - To avoid conflict with vendor-supplied libjpeg-turbo packages, the
+official RPMs and DEBs for libjpeg-turbo have been renamed to
+"libjpeg-turbo-official".
+     - The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
+official Linux and Mac packages, to avoid conflict with vendor-supplied
+packages and also to streamline the packaging system.
+     - Release packages are now created with the directory structure defined
+by the configure variables `prefix`, `bindir`, `libdir`, etc. (Un\*x) or by the
+`CMAKE_INSTALL_PREFIX` variable (Windows.)  The exception is that the docs are
+always located under the system default documentation directory on Un\*x and
+Mac systems, and on Windows, the TurboJPEG DLL is always located in the Windows
+system directory.
+     - To avoid confusion, official libjpeg-turbo packages on Linux/Unix
+platforms (except for Mac) will always install the 32-bit libraries in
+/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
+     - Fixed an issue whereby, in some cases, the libjpeg-turbo executables on
+Un*x systems were not properly linking with the shared libraries installed by
+the same package.
+     - Fixed an issue whereby building the "installer" target on Windows when
+`WITH_JAVA=1` would fail if the TurboJPEG JAR had not been previously built.
+     - Building the "install" target on Windows now installs files into the
+same places that the installer does.
+
+3. Fixed a Huffman encoder bug that prevented I/O suspension from working
+properly.
+
+
+1.2.90 (1.3 beta1)
+==================
+
+### Significant changes relative to 1.2.1:
+
+1. Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
+11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing.  Note that the IDCT will
+not be SIMD-accelerated when using any of these new scaling factors.
+
+2. The TurboJPEG dynamic library is now versioned.  It was not strictly
+necessary to do so, because TurboJPEG uses versioned symbols, and if a function
+changes in an ABI-incompatible way, that function is renamed and a legacy
+function is provided to maintain backward compatibility.  However, certain
+Linux distro maintainers have a policy against accepting any library that isn't
+versioned.
+
+3. Extended the TurboJPEG Java API so that it can be used to compress a JPEG
+image from and decompress a JPEG image to an arbitrary position in a large
+image buffer.
+
+4. The `tjDecompressToYUV()` function now supports the `TJFLAG_FASTDCT` flag.
+
+5. The 32-bit supplementary package for amd64 Debian systems now provides
+symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
+This allows those libraries to be used on MultiArch-compatible systems (such as
+Ubuntu 11 and later) without setting the linker path.
+
+6. The TurboJPEG Java wrapper should now find the JNI library on Mac systems
+without having to pass `-Djava.library.path=/usr/lib` to java.
+
+7. TJBench has been ported to Java to provide a convenient way of validating
+the performance of the TurboJPEG Java API.  It can be run with
+`java -cp turbojpeg.jar TJBench`.
+
+8. cjpeg can now be used to generate JPEG files with the RGB colorspace
+(feature ported from jpeg-8d.)
+
+9. The width and height in the `-crop` argument passed to jpegtran can now be
+suffixed with `f` to indicate that, when the upper left corner of the cropping
+region is automatically moved to the nearest iMCU boundary, the bottom right
+corner should be moved by the same amount.  In other words, this feature causes
+jpegtran to strictly honor the specified width/height rather than the specified
+bottom right corner (feature ported from jpeg-8d.)
+
+10. JPEG files using the RGB colorspace can now be decompressed into grayscale
+images (feature ported from jpeg-8d.)
+
+11. Fixed a regression caused by 1.2.1[7] whereby the build would fail with
+multiple "Mismatch in operand sizes" errors when attempting to build the x86
+SIMD code with NASM 0.98.
+
+12. The in-memory source/destination managers (`jpeg_mem_src()` and
+`jpeg_mem_dest()`) are now included by default when building libjpeg-turbo with
+libjpeg v6b or v7 emulation, so that programs can take advantage of these
+functions without requiring the use of the backward-incompatible libjpeg v8
+ABI.  The "age number" of the libjpeg-turbo library on Un*x systems has been
+incremented by 1 to reflect this.  You can disable this feature with a
+configure/CMake switch in order to retain strict API/ABI compatibility with the
+libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.)  See
+[README.md](README.md) for more details.
+
+13. Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
+libjpeg-turbo binary package for OS X, so that those libraries can be used to
+build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
+
+
+1.2.1
+=====
+
+### Significant changes relative to 1.2.0:
+
+1. Creating or decoding a JPEG file that uses the RGB colorspace should now
+work properly when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+2. When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+3. Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
+4. Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
+images (specifically, images in which the component count was erroneously set
+to a large value) would cause libjpeg-turbo to segfault.
+
+5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors.  The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular.  Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+6. Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+7. Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if the size of each row in the output image was not
+evenly divisible by 16 bytes.
+
+8. Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+9. Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
+
+
+1.2.0
+=====
+
+### Significant changes relative to 1.2 beta1:
+
+1. Fixed a build issue with YASM on Unix systems (the libjpeg-turbo build system
+was not adding the current directory to the assembler include path, so YASM
+was not able to find jsimdcfg.inc.)
+
+2. Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind.  See <http://crbug.com/72399> for more information.
+
+3. Added a compile-time macro (`LIBJPEG_TURBO_VERSION`) that can be used to
+check the version of libjpeg-turbo against which an application was compiled.
+
+4. Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
+and pixel formats (TurboJPEG API), which allow applications to specify that,
+when decompressing to a 4-component RGB buffer, the unused byte should be set
+to 0xFF so that it can be interpreted as an opaque alpha channel.
+
+5. Fixed a regression whereby DevIL failed to build against libjpeg-turbo
+because libjpeg-turbo's distributed version of jconfig.h contained an `INLINE`
+macro, which conflicted with a similar macro in DevIL.  This macro is used only
+internally when building libjpeg-turbo, so it was moved into config.h.
+
+6. libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
+K component is assigned a component ID of 1 instead of 4.  Although these files
+are in violation of the spec, other JPEG implementations handle them
+correctly.
+
+7. Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
+the official libjpeg-turbo binary package for OS X, so that those libraries can
+be used to build both OS X and iOS applications.
+
+
+1.1.90 (1.2 beta1)
+==================
+
+### Significant changes relative to 1.1.1:
+
+1. Added a Java wrapper for the TurboJPEG API.  See [java/README](java/README)
+for more details.
+
+2. The TurboJPEG API can now be used to scale down images during
+decompression.
+
+3. Added SIMD routines for RGB-to-grayscale color conversion, which
+significantly improves the performance of grayscale JPEG compression from an
+RGB source image.
+
+4. Improved the performance of the C color conversion routines, which are used
+on platforms for which SIMD acceleration is not available.
+
+5. Added a function to the TurboJPEG API that performs lossless transforms.
+This function is implemented using the same back end as jpegtran, but it
+performs transcoding entirely in memory and allows multiple transforms and/or
+crop operations to be batched together, so the source coefficients only need to
+be read once.  This is useful when generating image tiles from a single source
+JPEG.
+
+6. Added tests for the new TurboJPEG scaled decompression and lossless
+transform features to tjbench (the TurboJPEG benchmark, formerly called
+"jpgtest".)
+
+7. Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
+was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
+transposed or rotated 90 degrees.
+
+8. All legacy VirtualGL code has been refactored, and this has allowed
+libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
+
+9. libjpeg-turbo can now be built with YASM.
+
+10. Added SIMD acceleration for ARM Linux and iOS platforms that support
+NEON instructions.
+
+11. Refactored the TurboJPEG C API and documented it using Doxygen.  The
+TurboJPEG 1.2 API uses pixel formats to define the size and component order of
+the uncompressed source/destination images, and it includes a more efficient
+version of `TJBUFSIZE()` that computes a worst-case JPEG size based on the
+level of chrominance subsampling.  The refactored implementation of the
+TurboJPEG API now uses the libjpeg memory source and destination managers,
+which allows the TurboJPEG compressor to grow the JPEG buffer as necessary.
+
+12. Eliminated errors in the output of jpegtran on Windows that occurred when
+the application was invoked using I/O redirection
+(`jpegtran <input.jpg >output.jpg`.)
+
+13. The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
+support in libjpeg-turbo v1.1.0 introduced several new error constants in
+jerror.h, and these were mistakenly enabled for all emulation modes, causing
+the error enum in libjpeg-turbo to sometimes have different values than the
+same enum in libjpeg.  This represents an ABI incompatibility, and it caused
+problems with rare applications that took specific action based on a particular
+error value.  The fix was to include the new error constants conditionally
+based on whether libjpeg v7 or v8 emulation was enabled.
+
+14. Fixed an issue whereby Windows applications that used libjpeg-turbo would
+fail to compile if the Windows system headers were included before jpeglib.h.
+This issue was caused by a conflict in the definition of the INT32 type.
+
+15. Fixed 32-bit supplementary package for amd64 Debian systems, which was
+broken by enhancements to the packaging system in 1.1.
+
+16. When decompressing a JPEG image using an output colorspace of
+`JCS_EXT_RGBX`, `JCS_EXT_BGRX`, `JCS_EXT_XBGR`, or `JCS_EXT_XRGB`,
+libjpeg-turbo will now set the unused byte to 0xFF, which allows applications
+to interpret that byte as an alpha channel (0xFF = opaque).
+
+
+1.1.1
+=====
+
+### Significant changes relative to 1.1.0:
+
+1. Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
+by `tjEncodeYUV()`.
+
+2. libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
+markers found in the middle of the JPEG data stream during decompression.  It
+will now hand off decoding of a particular block to the unaccelerated Huffman
+decoder if an unexpected marker is found, so that the unaccelerated Huffman
+decoder can generate an appropriate warning.
+
+3. Older versions of MinGW64 prefixed symbol names with underscores by
+default, which differed from the behavior of 64-bit Visual C++.  MinGW64 1.0
+has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
+this, the libjpeg-turbo SIMD function names are no longer prefixed with an
+underscore when building with MinGW64.  This means that, when building
+libjpeg-turbo with older versions of MinGW64, you will now have to add
+`-fno-leading-underscore` to the `CFLAGS`.
+
+4. Fixed a regression bug in the NSIS script that caused the Windows installer
+build to fail when using the Visual Studio IDE.
+
+5. Fixed a bug in `jpeg_read_coefficients()` whereby it would not initialize
+`cinfo->image_width` and `cinfo->image_height` if libjpeg v7 or v8 emulation
+was enabled.  This specifically caused the jpegoptim program to fail if it was
+linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
+emulation.
+
+6. Eliminated excessive I/O overhead that occurred when reading BMP files in
+cjpeg.
+
+7. Eliminated errors in the output of cjpeg on Windows that occurred when the
+application was invoked using I/O redirection (`cjpeg <inputfile >output.jpg`.)
+
+
+1.1.0
+=====
+
+### Significant changes relative to 1.1 beta1:
+
+1. The algorithm used by the SIMD quantization function cannot produce correct
+results when the JPEG quality is >= 98 and the fast integer forward DCT is
+used.  Thus, the non-SIMD quantization function is now used for those cases,
+and libjpeg-turbo should now produce identical output to libjpeg v6b in all
+cases.
+
+2. Despite the above, the fast integer forward DCT still degrades somewhat for
+JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
+use the slow integer forward DCT when generating JPEG images of quality 96 or
+greater.  This reduces compression performance by as much as 15% for these
+high-quality images but is necessary to ensure that the images are perceptually
+lossless.  It also ensures that the library can avoid the performance pitfall
+created by [1].
+
+3. Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
+
+4. Fixed visual artifacts in grayscale JPEG compression caused by a typo in
+the RGB-to-luminance lookup tables.
+
+5. The Windows distribution packages now include the libjpeg run-time programs
+(cjpeg, etc.)
+
+6. All packages now include jpgtest.
+
+7. The TurboJPEG dynamic library now uses versioned symbols.
+
+8. Added two new TurboJPEG API functions, `tjEncodeYUV()` and
+`tjDecompressToYUV()`, to replace the somewhat hackish `TJ_YUV` flag.
+
+
+1.0.90 (1.1 beta1)
+==================
+
+### Significant changes relative to 1.0.1:
+
+1. Added emulation of the libjpeg v7 and v8 APIs and ABIs.  See
+[README.md](README.md) for more details.  This feature was sponsored by
+CamTrace SAS.
+
+2. Created a new CMake-based build system for the Visual C++ and MinGW builds.
+
+3. The TurboJPEG API can now be used to compress from and decompress to
+grayscale bitmaps.
+
+4. jpgtest can now be used to test decompression performance with existing
+JPEG images.
+
+5. If the default install prefix (/opt/libjpeg-turbo) is used, then
+`make install` now creates /opt/libjpeg-turbo/lib32 and
+/opt/libjpeg-turbo/lib64 symlinks to duplicate the behavior of the binary
+packages.
+
+6. All symbols in the libjpeg-turbo dynamic library are now versioned, even
+when the library is built with libjpeg v6b emulation.
+
+7. Added arithmetic encoding and decoding support (can be disabled with
+configure or CMake options.)
+
+8. Added a `TJ_YUV` flag to the TurboJPEG API, which causes both the compressor
+and decompressor to output planar YUV images.
+
+9. Added an extended version of `tjDecompressHeader()` to the TurboJPEG API,
+which allows the caller to determine the type of subsampling used in a JPEG
+image.
+
+10. Added further protections against invalid Huffman codes.
+
+
+1.0.1
+=====
+
+### Significant changes relative to 1.0.0:
+
+1. The Huffman decoder will now handle erroneous Huffman codes (for instance,
+from a corrupt JPEG image.)  Previously, these would cause libjpeg-turbo to
+crash under certain circumstances.
+
+2. Fixed a typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
+be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
+
+3. The configure script will now automatically determine whether the
+`INCOMPLETE_TYPES_BROKEN` macro should be defined.
+
+
+1.0.0
+=====
+
+### Significant changes relative to 0.0.93:
+
+1. 2983700: Further FreeBSD build tweaks (no longer necessary to specify
+`--host` when configuring on a 64-bit system)
+
+2. Created symlinks in the Unix/Linux packages so that the TurboJPEG
+include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
+static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
+64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
+
+3. The Unix/Linux distribution packages now include the libjpeg run-time
+programs (cjpeg, etc.) and man pages.
+
+4. Created a 32-bit supplementary package for amd64 Debian systems, which
+contains just the 32-bit libjpeg-turbo libraries.
+
+5. Moved the libraries from */lib32 to */lib in the i386 Debian package.
+
+6. Included a distribution package for Cygwin
+
+7. It is no longer necessary to specify `--without-simd` on non-x86
+architectures, and unit tests now work on those architectures.
+
+
+0.0.93
+======
+
+### Significant changes since 0.0.91:
+
+1. 2982659: Fixed x86-64 build on FreeBSD systems
+
+2. 2988188: Added support for Windows 64-bit systems
+
+
+0.0.91
+======
+
+### Significant changes relative to 0.0.90:
+
+1. Added documentation to .deb packages
+
+2. 2968313: Fixed data corruption issues when decompressing large JPEG images
+and/or using buffered I/O with the libjpeg-turbo decompressor
+
+
+0.0.90
+======
+
+Initial release
diff --git a/ChangeLog.txt b/ChangeLog.txt
deleted file mode 100644
index 68261e8..0000000
--- a/ChangeLog.txt
+++ /dev/null
@@ -1,791 +0,0 @@
-1.4.3
-=====
-
-[1] Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
-libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
-distributions.  This was due to the addition of a macro in jconfig.h that
-allows the Huffman codec to determine the word size at compile time.  Since
-that macro differs between 32-bit and 64-bit builds, this caused a conflict
-between the i386 and x86_64 RPMs (any differing files, other than executables,
-are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
-Since the macro is used only internally, it has been moved into jconfigint.h.
-
-[2] Fixed an issue in the accelerated Huffman decoder that could have caused
-the decoder to read past the end of the input buffer when a malformed,
-specially-crafted JPEG image was being decompressed.  In prior versions of
-libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
-if there were > 128 bytes of data in the input buffer.  However, it is possible
-to construct a JPEG image in which a single Huffman block is over 430 bytes
-long, so this version of libjpeg-turbo activates the accelerated Huffman
-decoder only if there are > 512 bytes of data in the input buffer.
-
-[3] Fixed a memory leak in tjunittest encountered when running the program
-with the -yuv option.
-
-[4] Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
-path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
-
-[5] Added libjpeg-turbo version and build information to the global string
-table of the libjpeg and TurboJPEG API libraries.  This is a common practice in
-other infrastructure libraries, such as OpenSSL and libpng, because it makes it
-easy to examine an application binary and determine which version of the
-library the application was linked against.
-
-[6] Fixed a couple of issues in the PPM reader that would cause buffer overruns
-in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
-maximum value defined in the file's header.  libjpeg-turbo 1.4.2 already
-included a similar fix for ASCII PPM/PGM files.  Note that these issues were
-not security bugs, since they were confined to the cjpeg program and did not
-affect any of the libjpeg-turbo libraries.
-
-[7] Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
-header using the tjDecompressToYUV2() function would cause the function to
-abort without returning an error and, under certain circumstances, corrupt the
-stack.  This only occurred if tjDecompressToYUV2() was called prior to calling
-calling tjDecompressHeader3(), or if the return value from
-tjDecompressHeader3() was ignored (both cases represent incorrect usage of the
-TurboJPEG API.)
-
-
-1.4.2
-=====
-
-[1] Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
-negative width or height was used as an input image (Windows bitmaps can have
-a negative height if they are stored in top-down order, but such files are
-rare and not supported by libjpeg-turbo.)
-
-[2] Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
-incorrectly encode certain JPEG images when quality=100 and the fast integer
-forward DCT were used.  This was known to cause 'make test' to fail when the
-library was built with '-march=haswell' on x86 systems.
-
-[3] Fixed an issue whereby libjpeg-turbo would crash when built with the latest
-& greatest development version of the Clang/LLVM compiler.  This was caused by
-an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
-routines.  Those routines were incorrectly using a 64-bit mov instruction to
-transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
-(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
-Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
-structure members into a single 64-bit register, and this exposed the ABI
-conformance issue.
-
-[4] Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
-upsampling routine that caused a buffer overflow (and subsequent segfault) when
-decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
-pixels.  The "plain" upsampling routines are normally only used when
-decompressing a non-YCbCr JPEG image, but they are also used when decompressing
-a JPEG image whose scaled output height is 1.
-
-[5] Fixed various negative left shifts and other issues reported by the GCC and
-Clang undefined behavior sanitizers.  None of these was known to pose a
-security threat, but removing the warnings makes it easier to detect actual
-security issues, should they arise in the future.
-
-
-1.4.1
-=====
-
-[1] tjbench now properly handles CMYK/YCCK JPEG files.  Passing an argument of
--cmyk (instead of, for instance, -rgb) will cause tjbench to internally convert
-the source bitmap to CMYK prior to compression, to generate YCCK JPEG files,
-and to internally convert the decompressed CMYK pixels back to RGB after
-decompression (the latter is done automatically if a CMYK or YCCK JPEG is
-passed to tjbench as a source image.)  The CMYK<->RGB conversion operation is
-not benchmarked.  NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
-uses are suitable for testing only.  Proper conversion between CMYK and RGB
-requires a color management system.
-
-[2] 'make test' now performs additional bitwise regression tests using tjbench,
-mainly for the purpose of testing compression from/decompression to a subregion
-of a larger image buffer.
-
-[3] 'make test' no longer tests the regression of the floating point DCT/IDCT
-by default, since the results of those tests can vary if the algorithms in
-question are not implemented using SIMD instructions on a particular platform.
-See the comments in Makefile.am for information on how to re-enable the tests
-and to specify an expected result for them based on the particulars of your
-platform.
-
-[4] The NULL color conversion routines have been significantly optimized,
-which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
-64-bit code and 0-3% when using 32-bit code, and the decompression of those
-images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
-
-[5] Fixed an "illegal instruction" error that occurred when djpeg from a
-SIMD-enabled libjpeg-turbo MIPS build was executed with the -nosmooth option on
-a MIPS machine that lacked DSPr2 support.  The MIPS SIMD routines for h2v1 and
-h2v2 merged upsampling were not properly checking for the existence of DSPr2.
-
-[6] Performance has been improved significantly on 64-bit non-Linux and
-non-Windows platforms (generally 10-20% faster compression and 5-10% faster
-decompression.)  Due to an oversight, the 64-bit version of the accelerated
-Huffman codec was not being compiled in when libjpeg-turbo was built on
-platforms other than Windows or Linux.  Oops.
-
-[7] Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
-builds of libjpeg-turbo to incorrectly encode a few specific test images when
-quality=98, an optimized Huffman table, and the slow integer forward DCT were
-used.
-
-[8] The Windows (CMake) build system now supports building only static or only
-shared libraries.  This is accomplished by adding either -DENABLE_STATIC=0 or
--DENABLE_SHARED=0 to the CMake command line.
-
-[9] TurboJPEG API functions will now return an error code if a warning is
-triggered in the underlying libjpeg API.  For instance, if a JPEG file is
-corrupt, the TurboJPEG decompression functions will attempt to decompress
-as much of the image as possible, but those functions will now return -1 to
-indicate that the decompression was not entirely successful.
-
-[10] Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
-buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
-in which the right-most MCU was 5 or 6 pixels wide.
-
-
-1.4.0
-=====
-
-[1] Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
-because OS X does not provide the le32toh() and htole32() functions.)
-
-[2] The non-SIMD RGB565 color conversion code did not work correctly on big
-endian machines.  This has been fixed.
-
-[3] Fixed an issue in tjPlaneSizeYUV() whereby it would erroneously return 1
-instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
-
-[3] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
-instead of -1 if width was < 1.
-
-[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
-ARM64 platforms (see 1.4 beta1 [5].)
-
-[6] The close() method in the TJCompressor and TJDecompressor Java classes is
-now idempotent.  Previously, that method would call the native tjDestroy()
-function even if the TurboJPEG instance had already been destroyed.  This
-caused an exception to be thrown during finalization, if the close() method had
-already been called.  The exception was caught, but it was still an expensive
-operation.
-
-[7] The TurboJPEG API previously generated an error ("Could not determine
-subsampling type for JPEG image") when attempting to decompress grayscale JPEG
-images that were compressed with a sampling factor other than 1 (for instance,
-with 'cjpeg -grayscale -sample 2x2').  Subsampling technically has no meaning
-with grayscale JPEGs, and thus the horizontal and vertical sampling factors
-for such images are ignored by the decompressor.  However, the TurboJPEG API
-was being too rigid and was expecting the sampling factors to be equal to 1
-before it treated the image as a grayscale JPEG.
-
-[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
-print the library version and exit.
-
-[9] Referring to 1.4 beta1 [15], another extremely rare circumstance was
-discovered under which the Huffman encoder's local buffer can be overrun
-when a buffered destination manager is being used and an
-extremely-high-frequency block (basically junk image data) is being encoded.
-Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
-to address the previous issue, the new issue caused even the larger buffer to
-be overrun.  Further analysis reveals that, in the absolute worst case (such as
-setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
-order), the Huffman encoder can produce encoded blocks that approach double the
-size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
-256 bytes, which should prevent any such issue from re-occurring in the future.
-
-[10] The new tjPlaneSizeYUV(), tjPlaneWidth(), and tjPlaneHeight() functions
-were not actually usable on any platform except OS X and Windows, because
-those functions were not included in the libturbojpeg mapfile.  This has been
-fixed.
-
-[11] Restored the JPP(), JMETHOD(), and FAR macros in the libjpeg-turbo header
-files.  The JPP() and JMETHOD() macros were originally implemented in libjpeg
-as a way of supporting non-ANSI compilers that lacked support for prototype
-parameters.  libjpeg-turbo has never supported such compilers, but some
-software packages still use the macros to define their own prototypes.
-Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
-have far symbols, but some software packages still use the FAR macro.  A pretty
-good argument can be made that this is a bad practice on the part of the
-software in question, but since this affects more than one package, it's just
-easier to fix it here.
-
-[12] Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
-for iOS, and included an ARMv8 architecture in all of the binaries installed by
-the "official" libjpeg-turbo SDK for OS X.
-
-
-1.3.90 (1.4 beta1)
-==================
-
-[1] New features in the TurboJPEG API:
--- YUV planar images can now be generated with an arbitrary line padding
-(previously only 4-byte padding, which was compatible with X Video, was
-supported.)
--- The decompress-to-YUV function has been extended to support image scaling.
--- JPEG images can now be compressed from YUV planar source images.
--- YUV planar images can now be decoded into RGB or grayscale images.
--- 4:1:1 subsampling is now supported.  This is mainly included for
-compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
-significant advantages relative to 4:2:0.
--- CMYK images are now supported.  This feature allows CMYK source images to be
-compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to CMYK
-destination images.  Conversion between CMYK/YCCK and RGB or YUV images is not
-supported.  Such conversion requires a color management system and is thus out
-of scope for a codec library.
--- The handling of YUV images in the Java API has been significantly refactored
-and should now be much more intuitive.
--- The Java API now supports encoding a YUV image from an arbitrary position in
-a large image buffer.
--- All of the YUV functions now have a corresponding function that operates on
-separate image planes instead of a unified image buffer.  This allows for
-compressing/decoding from or decompressing/encoding to a subregion of a larger
-YUV image.  It also allows for handling YUV formats that swap the order of the
-U and V planes.
-
-[2] Added SIMD acceleration for DSPr2-capable MIPS platforms.  This speeds up
-the compression of full-color JPEGs by 70-80% on such platforms and
-decompression by 25-35%.
-
-[3] If an application attempts to decompress a Huffman-coded JPEG image whose
-header does not contain Huffman tables, libjpeg-turbo will now insert the
-default Huffman tables.  In order to save space, many motion JPEG video frames
-are encoded without the default Huffman tables, so these frames can now be
-successfully decompressed by libjpeg-turbo without additional work on the part
-of the application.  An application can still override the Huffman tables, for
-instance to re-use tables from a previous frame of the same video.
-
-[4] The Mac packaging system now uses pkgbuild and productbuild rather than
-PackageMaker (which is obsolete and no longer supported.)  This means that
-OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
-although the packages produced can be installed on OS X 10.5 "Leopard" or
-later.  OS X 10.4 "Tiger" is no longer supported.
-
-[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
-ARM platforms rather than a lookup table.  This reduces the memory footprint
-by 64k, which may be important for some mobile applications.  Out of four
-Android devices that were tested, two demonstrated a small overall performance
-loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
-ARMv7 code when enabling this new feature, but the other two devices
-demonstrated a significant overall performance gain with both ARMv6 and ARMv7
-code (~10-20%) when enabling the feature.  Actual mileage may vary.
-
-[6] Worked around an issue with Visual C++ 2010 and later that caused incorrect
-pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
-if compiler optimization was enabled when libjpeg-turbo was built.  This caused
-the regression tests to fail when doing a release build under Visual C++ 2010
-and later.
-
-[7] Improved the accuracy and performance of the non-SIMD implementation of the
-floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
-The accuracy of this implementation now matches the accuracy of the SSE/SSE2
-implementation.  Note, however, that the floating point DCT/IDCT algorithms are
-mainly a legacy feature.  They generally do not produce significantly better
-accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit
-slower.
-
-[8] Added a new output colorspace (JCS_RGB565) to the libjpeg API that allows
-for decompressing JPEG images into RGB565 (16-bit) pixels.  If dithering is not
-used, then this code path is SIMD-accelerated on ARM platforms.
-
-[9] Numerous obsolete features, such as support for non-ANSI compilers and
-support for the MS-DOS memory model, were removed from the libjpeg code,
-greatly improving its readability and making it easier to maintain and extend.
-
-[10] Fixed a segfault that occurred when calling output_message() with msg_code
-set to JMSG_COPYRIGHT.
-
-[11] Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
-characters to be passed on the command line, which was causing it to generate
-incorrect JPEG files.
-
-[12] Fixed a bug in the build system that was causing the Windows version of
-wrjpgcom to be built using the rdjpgcom source code.
-
-[13] Restored 12-bit-per-component JPEG support.  A 12-bit version of
-libjpeg-turbo can now be built by passing an argument of --with-12bit to
-configure (Unix) or -DWITH_12BIT=1 to cmake (Windows.)  12-bit JPEG support is
-included only for convenience.  Enabling this feature disables all of the
-performance features in libjpeg-turbo, as well as arithmetic coding and the
-TurboJPEG API.  The resulting library still contains the other libjpeg-turbo
-features (such as the colorspace extensions), but in general, it performs no
-faster than libjpeg v6b.
-
-[14] Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
-and IDCT algorithms (both are used during JPEG decompression.)  For unknown
-reasons (probably related to clang), this code cannot currently be compiled for
-iOS.
-
-[15] Fixed an extremely rare bug that could cause the Huffman encoder's local
-buffer to overrun when a very high-frequency MCU is compressed using quality
-100 and no subsampling, and when the JPEG output buffer is being dynamically
-resized by the destination manager.  This issue was so rare that, even with a
-test program specifically designed to make the bug occur (by injecting random
-high-frequency YUV data into the compressor), it was reproducible only once in
-about every 25 million iterations.
-
-[16] Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
-compression functions was called repeatedly with the same
-automatically-allocated destination buffer, then TurboJPEG would erroneously
-assume that the jpegSize parameter was equal to the size of the buffer, when in
-fact that parameter was probably equal to the size of the most recently
-compressed JPEG image.  If the size of the previous JPEG image was not as large
-as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
-destination buffer.
-
-
-1.3.1
-=====
-
-[1] On Un*x systems, 'make install' now installs the libjpeg-turbo libraries
-into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
-and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
-x86-64.  You can override this by overriding either the 'prefix' or 'libdir'
-configure variables.
-
-[2] The Windows installer now places a copy of the TurboJPEG DLLs in the same
-directory as the rest of the libjpeg-turbo binaries.  This was mainly done
-to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
-When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
-access the c:\WINDOWS\system32 directory, which made it impossible for the
-TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
-
-[3] Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
-entropy coding (by passing arguments of -progressive -arithmetic to cjpeg or
-jpegtran, for instance) would result in an error, "Requested feature was
-omitted at compile time".
-
-[4] Fixed a couple of issues whereby malformed JPEG images would cause
-libjpeg-turbo to use uninitialized memory during decompression.
-
-[5] Fixed an error ("Buffer passed to JPEG library is too small") that occurred
-when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
-source image, and added a unit test to check for this error.
-
-[6] The Java classes should now build properly under Visual Studio 2010 and
-later.
-
-[7] Fixed an issue that prevented SRPMs generated using the in-tree packaging
-tools from being rebuilt on certain newer Linux distributions.
-
-[8] Numerous minor fixes to eliminate compilation and build/packaging system
-warnings, fix cosmetic issues, improve documentation clarity, and other general
-source cleanup.
-
-
-1.3.0
-=====
-
-[1] 'make test' now works properly on FreeBSD, and it no longer requires the
-md5sum executable to be present on other Un*x platforms.
-
-[2] Overhauled the packaging system:
--- To avoid conflict with vendor-supplied libjpeg-turbo packages, the
-official RPMs and DEBs for libjpeg-turbo have been renamed to
-"libjpeg-turbo-official".
--- The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
-official Linux and Mac packages, to avoid conflict with vendor-supplied
-packages and also to streamline the packaging system.
--- Release packages are now created with the directory structure defined
-by the configure variables "prefix", "bindir", "libdir", etc. (Un*x) or by the
-CMAKE_INSTALL_PREFIX variable (Windows.)  The exception is that the docs are
-always located under the system default documentation directory on Un*x and Mac
-systems, and on Windows, the TurboJPEG DLL is always located in the Windows
-system directory.
--- To avoid confusion, official libjpeg-turbo packages on Linux/Unix platforms
-(except for Mac) will always install the 32-bit libraries in
-/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
--- Fixed an issue whereby, in some cases, the libjpeg-turbo executables on Un*x
-systems were not properly linking with the shared libraries installed by the
-same package.
--- Fixed an issue whereby building the "installer" target on Windows when
-WITH_JAVA=1 would fail if the TurboJPEG JAR had not been previously built.
--- Building the "install" target on Windows now installs files into the same
-places that the installer does.
-
-[3] Fixed a Huffman encoder bug that prevented I/O suspension from working
-properly.
-
-
-1.2.90 (1.3 beta1)
-==================
-
-[1] Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
-11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing.  Note that the IDCT will
-not be SIMD-accelerated when using any of these new scaling factors.
-
-[2] The TurboJPEG dynamic library is now versioned.  It was not strictly
-necessary to do so, because TurboJPEG uses versioned symbols, and if a function
-changes in an ABI-incompatible way, that function is renamed and a legacy
-function is provided to maintain backward compatibility.  However, certain
-Linux distro maintainers have a policy against accepting any library that isn't
-versioned.
-
-[3] Extended the TurboJPEG Java API so that it can be used to compress a JPEG
-image from and decompress a JPEG image to an arbitrary position in a large
-image buffer.
-
-[4] The tjDecompressToYUV() function now supports the TJFLAG_FASTDCT flag.
-
-[5] The 32-bit supplementary package for amd64 Debian systems now provides
-symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
-This allows those libraries to be used on MultiArch-compatible systems (such as
-Ubuntu 11 and later) without setting the linker path.
-
-[6] The TurboJPEG Java wrapper should now find the JNI library on Mac systems
-without having to pass -Djava.library.path=/usr/lib to java.
-
-[7] TJBench has been ported to Java to provide a convenient way of validating
-the performance of the TurboJPEG Java API.  It can be run with
-'java -cp turbojpeg.jar TJBench'.
-
-[8] cjpeg can now be used to generate JPEG files with the RGB colorspace
-(feature ported from jpeg-8d.)
-
-[9] The width and height in the -crop argument passed to jpegtran can now be
-suffixed with "f" to indicate that, when the upper left corner of the cropping
-region is automatically moved to the nearest iMCU boundary, the bottom right
-corner should be moved by the same amount.  In other words, this feature causes
-jpegtran to strictly honor the specified width/height rather than the specified
-bottom right corner (feature ported from jpeg-8d.)
-
-[10] JPEG files using the RGB colorspace can now be decompressed into grayscale
-images (feature ported from jpeg-8d.)
-
-[11] Fixed a regression caused by 1.2.1[7] whereby the build would fail with
-multiple "Mismatch in operand sizes" errors when attempting to build the x86
-SIMD code with NASM 0.98.
-
-[12] The in-memory source/destination managers (jpeg_mem_src() and
-jpeg_mem_dest()) are now included by default when building libjpeg-turbo with
-libjpeg v6b or v7 emulation, so that programs can take advantage of these
-functions without requiring the use of the backward-incompatible libjpeg v8
-ABI.  The "age number" of the libjpeg-turbo library on Un*x systems has been
-incremented by 1 to reflect this.  You can disable this feature with a
-configure/CMake switch in order to retain strict API/ABI compatibility with the
-libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.)  See
-README-turbo.txt for more details.
-
-[13] Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
-libjpeg-turbo binary package for OS X, so that those libraries can be used to
-build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
-
-
-1.2.1
-=====
-
-[1] Creating or decoding a JPEG file that uses the RGB colorspace should now
-properly work when the input or output colorspace is one of the libjpeg-turbo
-colorspace extensions.
-
-[2] When libjpeg-turbo was built without SIMD support and merged (non-fancy)
-upsampling was used along with an alpha-enabled colorspace during
-decompression, the unused byte of the decompressed pixels was not being set to
-0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
-correct behavior of the colorspace extensions when merged upsampling is used.
-
-[3] Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
-upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
-calling conventions.
-
-[4] Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
-images (specifically, images in which the component count was erroneously set
-to a large value) would cause libjpeg-turbo to segfault.
-
-[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
-processors.  The MASKMOVDQU instruction, which was used by the libjpeg-turbo
-SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
-it is painfully slow on Bobcat processors in particular.  Eliminating the use
-of this instruction improved performance by an order of magnitude on Bobcat
-processors and by a small amount (typically 5%) on AMD desktop processors.
-
-[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
-platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
-platforms.
-
-[7] Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
-running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
-4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
-upsampling would produce several incorrect columns of pixels at the right-hand
-side of the output image if each row in the output image was not evenly
-divisible by 16 bytes.
-
-[8] Fixed an issue whereby attempting to build the SIMD extensions with Xcode
-4.3 on OS X platforms would cause NASM to return numerous errors of the form
-"'%define' expects a macro identifier".
-
-[9] Added flags to the TurboJPEG API that allow the caller to force the use of
-either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
-
-
-1.2.0
-=====
-
-[1] Fixed build issue with YASM on Unix systems (the libjpeg-turbo build system
-was not adding the current directory to the assembler include path, so YASM
-was not able to find jsimdcfg.inc.)
-
-[2] Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
-a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
-This was more of an annoyance than an actual bug, since it did not cause any
-actual run-time problems, but the issue showed up when running libjpeg-turbo in
-valgrind.  See http://crbug.com/72399 for more information.
-
-[3] Added a compile-time macro (LIBJPEG_TURBO_VERSION) that can be used to
-check the version of libjpeg-turbo against which an application was compiled.
-
-[4] Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
-and pixel formats (TurboJPEG API), which allow applications to specify that,
-when decompressing to a 4-component RGB buffer, the unused byte should be set
-to 0xFF so that it can be interpreted as an opaque alpha channel.
-
-[5] Fixed regression issue whereby DevIL failed to build against libjpeg-turbo
-because libjpeg-turbo's distributed version of jconfig.h contained an INLINE
-macro, which conflicted with a similar macro in DevIL.  This macro is used only
-internally when building libjpeg-turbo, so it was moved into config.h.
-
-[6] libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
-K component is assigned a component ID of 1 instead of 4.  Although these files
-are in violation of the spec, other JPEG implementations handle them
-correctly.
-
-[7] Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
-the official libjpeg-turbo binary package for OS X, so that those libraries can
-be used to build both OS X and iOS applications.
-
-
-1.1.90 (1.2 beta1)
-==================
-
-[1] Added a Java wrapper for the TurboJPEG API.  See java/README for more
-details.
-
-[2] The TurboJPEG API can now be used to scale down images during
-decompression.
-
-[3] Added SIMD routines for RGB-to-grayscale color conversion, which
-significantly improves the performance of grayscale JPEG compression from an
-RGB source image.
-
-[4] Improved the performance of the C color conversion routines, which are used
-on platforms for which SIMD acceleration is not available.
-
-[5] Added a function to the TurboJPEG API that performs lossless transforms.
-This function is implemented using the same back end as jpegtran, but it
-performs transcoding entirely in memory and allows multiple transforms and/or
-crop operations to be batched together, so the source coefficients only need to
-be read once.  This is useful when generating image tiles from a single source
-JPEG.
-
-[6] Added tests for the new TurboJPEG scaled decompression and lossless
-transform features to tjbench (the TurboJPEG benchmark, formerly called
-"jpgtest".)
-
-[7] Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
-was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
-transposed or rotated 90 degrees.
-
-[8] All legacy VirtualGL code has been re-factored, and this has allowed
-libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
-
-[9] libjpeg-turbo can now be built with YASM.
-
-[10] Added SIMD acceleration for ARM Linux and iOS platforms that support
-NEON instructions.
-
-[11] Refactored the TurboJPEG C API and documented it using Doxygen.  The
-TurboJPEG 1.2 API uses pixel formats to define the size and component order of
-the uncompressed source/destination images, and it includes a more efficient
-version of TJBUFSIZE() that computes a worst-case JPEG size based on the level
-of chrominance subsampling.  The refactored implementation of the TurboJPEG API
-now uses the libjpeg memory source and destination managers, which allows the
-TurboJPEG compressor to grow the JPEG buffer as necessary.
-
-[12] Eliminated errors in the output of jpegtran on Windows that occurred when
-the application was invoked using I/O redirection
-(jpegtran <input.jpg >output.jpg).
-
-[13] The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
-support in libjpeg-turbo v1.1.0 introduced several new error constants in
-jerror.h, and these were mistakenly enabled for all emulation modes, causing
-the error enum in libjpeg-turbo to sometimes have different values than the
-same enum in libjpeg.  This represents an ABI incompatibility, and it caused
-problems with rare applications that took specific action based on a particular
-error value.  The fix was to include the new error constants conditionally
-based on whether libjpeg v7 or v8 emulation was enabled.
-
-[14] Fixed an issue whereby Windows applications that used libjpeg-turbo would
-fail to compile if the Windows system headers were included before jpeglib.h.
-This issue was caused by a conflict in the definition of the INT32 type.
-
-[15] Fixed 32-bit supplementary package for amd64 Debian systems, which was
-broken by enhancements to the packaging system in 1.1.
-
-[16] When decompressing a JPEG image using an output colorspace of
-JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR, or JCS_EXT_XRGB, libjpeg-turbo will
-now set the unused byte to 0xFF, which allows applications to interpret that
-byte as an alpha channel (0xFF = opaque).
-
-
-1.1.1
-=====
-
-[1] Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
-by tjEncodeYUV().
-
-[2] libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
-markers found in the middle of the JPEG data stream during decompression.  It
-will now hand off decoding of a particular block to the unaccelerated Huffman
-decoder if an unexpected marker is found, so that the unaccelerated Huffman
-decoder can generate an appropriate warning.
-
-[3] Older versions of MinGW64 prefixed symbol names with underscores by
-default, which differed from the behavior of 64-bit Visual C++.  MinGW64 1.0
-has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
-this, the libjpeg-turbo SIMD function names are no longer prefixed with an
-underscore when building with MinGW64.  This means that, when building
-libjpeg-turbo with older versions of MinGW64, you will now have to add
--fno-leading-underscore to the CFLAGS.
-
-[4] Fixed a regression bug in the NSIS script that caused the Windows installer
-build to fail when using the Visual Studio IDE.
-
-[5] Fixed a bug in jpeg_read_coefficients() whereby it would not initialize
-cinfo->image_width and cinfo->image_height if libjpeg v7 or v8 emulation was
-enabled.  This specifically caused the jpegoptim program to fail if it was
-linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
-emulation.
-
-[6] Eliminated excessive I/O overhead that occurred when reading BMP files in
-cjpeg.
-
-[7] Eliminated errors in the output of cjpeg on Windows that occurred when the
-application was invoked using I/O redirection (cjpeg <inputfile >output.jpg).
-
-
-1.1.0
-=====
-
-[1] The algorithm used by the SIMD quantization function cannot produce correct
-results when the JPEG quality is >= 98 and the fast integer forward DCT is
-used.  Thus, the non-SIMD quantization function is now used for those cases,
-and libjpeg-turbo should now produce identical output to libjpeg v6b in all
-cases.
-
-[2] Despite the above, the fast integer forward DCT still degrades somewhat for
-JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
-use the slow integer forward DCT when generating JPEG images of quality 96 or
-greater.  This reduces compression performance by as much as 15% for these
-high-quality images but is necessary to ensure that the images are perceptually
-lossless.  It also ensures that the library can avoid the performance pitfall
-created by [1].
-
-[3] Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
-
-[4] Fixed visual artifacts in grayscale JPEG compression caused by a typo in
-the RGB-to-luminance lookup tables.
-
-[5] The Windows distribution packages now include the libjpeg run-time programs
-(cjpeg, etc.)
-
-[6] All packages now include jpgtest.
-
-[7] The TurboJPEG dynamic library now uses versioned symbols.
-
-[8] Added two new TurboJPEG API functions, tjEncodeYUV() and
-tjDecompressToYUV(), to replace the somewhat hackish TJ_YUV flag.
-
-
-1.0.90 (1.1 beta1)
-==================
-
-[1] Added emulation of the libjpeg v7 and v8 APIs and ABIs.  See
-README-turbo.txt for more details.  This feature was sponsored by CamTrace SAS.
-
-[2] Created a new CMake-based build system for the Visual C++ and MinGW builds.
-
-[3] Grayscale bitmaps can now be compressed from/decompressed to using the
-TurboJPEG API.
-
-[4] jpgtest can now be used to test decompression performance with existing
-JPEG images.
-
-[5] If the default install prefix (/opt/libjpeg-turbo) is used, then
-'make install' now creates /opt/libjpeg-turbo/lib32 and
-/opt/libjpeg-turbo/lib64 sym links to duplicate the behavior of the binary
-packages.
-
-[6] All symbols in the libjpeg-turbo dynamic library are now versioned, even
-when the library is built with libjpeg v6b emulation.
-
-[7] Added arithmetic encoding and decoding support (can be disabled with
-configure or CMake options)
-
-[8] Added a TJ_YUV flag to the TurboJPEG API, which causes both the compressor
-and decompressor to output planar YUV images.
-
-[9] Added an extended version of tjDecompressHeader() to the TurboJPEG API,
-which allows the caller to determine the type of subsampling used in a JPEG
-image.
-
-[10] Added further protections against invalid Huffman codes.
-
-
-1.0.1
-=====
-
-[1] The Huffman decoder will now handle erroneous Huffman codes (for instance,
-from a corrupt JPEG image.)  Previously, these would cause libjpeg-turbo to
-crash under certain circumstances.
-
-[2] Fixed typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
-be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
-
-[3] configure script will now automatically determine whether the
-INCOMPLETE_TYPES_BROKEN macro should be defined.
-
-
-1.0.0
-=====
-
-[1] 2983700: Further FreeBSD build tweaks (no longer necessary to specify
---host when configuring on a 64-bit system)
-
-[2] Created symlinks in the Unix/Linux packages so that the TurboJPEG
-include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
-static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
-64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
-
-[3] The Unix/Linux distribution packages now include the libjpeg run-time
-programs (cjpeg, etc.) and man pages.
-
-[4] Created a 32-bit supplementary package for amd64 Debian systems, which
-contains just the 32-bit libjpeg-turbo libraries.
-
-[5] Moved the libraries from */lib32 to */lib in the i386 Debian package.
-
-[6] Include distribution package for Cygwin
-
-[7] No longer necessary to specify --without-simd on non-x86 architectures, and
-unit tests now work on those architectures.
-
-
-0.0.93
-======
-
-[1] 2982659, Fixed x86-64 build on FreeBSD systems
-
-[2] 2988188: Added support for Windows 64-bit systems
-
-
-0.0.91
-======
-
-[1] Added documentation to .deb packages
-
-[2] 2968313: Fixed data corruption issues when decompressing large JPEG images
-and/or using buffered I/O with the libjpeg-turbo decompressor
-
-
-0.0.90
-======
-
-Initial release
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..4623e29
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,88 @@
+libjpeg-turbo Licenses
+======================
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses:
+
+- The IJG (Independent JPEG Group) License, which is listed in
+  [README.ijg](README.ijg)
+
+  This license applies to the libjpeg API library and associated programs
+  (any code inherited from libjpeg, and any modifications to that code.)
+
+- The Modified (3-clause) BSD License, which is listed in
+  [turbojpeg.c](turbojpeg.c)
+
+  This license covers the TurboJPEG API library and associated programs.
+
+- The zlib License, which is listed in [simd/jsimdext.inc](simd/jsimdext.inc)
+
+  This license is a subset of the other two, and it covers the libjpeg-turbo
+  SIMD extensions.
+
+
+Complying with the libjpeg-turbo Licenses
+=========================================
+
+This section provides a roll-up of the libjpeg-turbo licensing terms, to the
+best of our understanding.
+
+1.  If you are distributing a modified version of the libjpeg-turbo source,
+    then:
+
+    1.  You cannot alter or remove any existing copyright or license notices
+        from the source.
+
+        **Origin**
+        - Clause 1 of the IJG License
+        - Clause 1 of the Modified BSD License
+        - Clauses 1 and 3 of the zlib License
+
+    2.  You must add your own copyright notice to the header of each source
+        file you modified, so others can tell that you modified that file (if
+        there is not an existing copyright header in that file, then you can
+        simply add a notice stating that you modified the file.)
+
+        **Origin**
+        - Clause 1 of the IJG License
+        - Clause 2 of the zlib License
+
+    3.  You must include the IJG README file, and you must not alter any of the
+        copyright or license text in that file.
+
+        **Origin**
+        - Clause 1 of the IJG License
+
+2.  If you are distributing only libjpeg-turbo binaries without the source, or
+    if you are distributing an application that statically links with
+    libjpeg-turbo, then:
+
+    1.  Your product documentation must include a message stating:
+
+        This software is based in part on the work of the Independent JPEG
+        Group.
+
+        **Origin**
+        - Clause 2 of the IJG License
+
+    2.  If your binary distribution includes or uses the TurboJPEG API, then
+        your product documentation must include the text of the Modified BSD
+        License.
+
+        **Origin**
+        - Clause 2 of the Modified BSD License
+
+3.  You cannot use the name of the IJG or The libjpeg-turbo Project or the
+    contributors thereof in advertising, publicity, etc.
+
+    **Origin**
+    - IJG License
+    - Clause 3 of the Modified BSD License
+
+4.  The IJG and The libjpeg-turbo Project do not warrant libjpeg-turbo to be
+    free of defects, nor do we accept any liability for undesirable
+    consequences resulting from your use of the software.
+
+    **Origin**
+    - IJG License
+    - Modified BSD License
+    - zlib License
diff --git a/LICENSE.txt b/LICENSE.txt
deleted file mode 100644
index 5cab36e..0000000
--- a/LICENSE.txt
+++ /dev/null
@@ -1,79 +0,0 @@
-libjpeg-turbo Licenses
-----------------------
-
-libjpeg-turbo is covered by three compatible BSD-style open source licenses:
-
--- The IJG (Independent JPEG Group) License, which is listed in README
-
-   This license applies to the libjpeg API library and associated programs
-   (any code inherited from libjpeg, and any modifications to that code.)
-
--- The Modified (3-clause) BSD License, which is listed in turbojpeg.c
-
-   This license covers the TurboJPEG API library and associated programs.
-
--- The zlib License, which is listed in simd/jsimdext.inc
-
-   This license is a subset of the other two, and it covers the libjpeg-turbo
-   SIMD extensions.
-
-
-Complying with the libjpeg-turbo Licenses
------------------------------------------
-
-This section provides a roll-up of the libjpeg-turbo licensing terms, to the
-best of our understanding.
-
-1.  If you are distributing a modified version of the libjpeg-turbo source,
-    then:
-
-    a.  You cannot alter or remove any existing copyright or license notices
-        from the source.
-
-        Origin:  Clause 1 of the IJG License
-                 Clause 1 of the Modified BSD License
-                 Clauses 1 and 3 of the zlib License
-
-    b.  You must add your own copyright notice to the header of each source
-        file you modified, so others can tell that you modified that file (if
-        there is not an existing copyright header in that file, then you can
-        simply add a notice stating that you modified the file.)
-
-        Origin:  Clause 1 of the IJG License
-                 Clause 2 of the zlib License
-
-    c.  You must include the IJG README file, and you must not alter any of the
-        copyright or license text in that file.
-
-        Origin:  Clause 1 of the IJG License
-
-2.  If you are distributing only libjpeg-turbo binaries without the source, or
-    if you are distributing an application that statically links with
-    libjpeg-turbo, then:
-
-    a.  Your product documentation must include a message stating:
-
-        This software is based in part on the work of the Independent JPEG
-        Group.
-
-        Origin:  Clause 2 of the IJG license
-
-    b.  If your binary distribution includes or uses the TurboJPEG API, then
-        your product documentation must include the text of the Modified BSD
-        License.
-
-        Origin:  Clause 2 of the Modified BSD License
-
-3.  You cannot use the name of the IJG or The libjpeg-turbo Project or the
-    contributors thereof in advertising, publicity, etc.
-
-    Origin:  IJG License
-             Clause 3 of the Modified BSD License
-
-4.  The IJG and The libjpeg-turbo Project do not warrant libjpeg-turbo to be
-    free of defects, nor do we accept any liability for undesirable
-    consequences resulting from your use of the software.
-
-    Origin:  IJG License
-             Modified BSD License
-             zlib License
diff --git a/Makefile.am b/Makefile.am
index d4b1647..80f0059 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,6 +10,8 @@
 
 nodist_include_HEADERS = jconfig.h
 
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = pkgscripts/libjpeg.pc pkgscripts/libturbojpeg.pc
 
 HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
 	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
@@ -155,11 +157,11 @@
 
 dist_man1_MANS = cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
 
-DOCS= coderules.txt jconfig.txt change.log rdrle.c wrrle.c BUILDING.txt \
-	ChangeLog.txt
+DOCS= coderules.txt jconfig.txt change.log rdrle.c wrrle.c BUILDING.md \
+	ChangeLog.md
 
-dist_doc_DATA = README README-turbo.txt libjpeg.txt structure.txt usage.txt \
-	wizard.txt
+dist_doc_DATA = README.ijg README.md libjpeg.txt structure.txt usage.txt \
+	wizard.txt LICENSE.md
 
 exampledir = $(docdir)
 dist_example_DATA = example.c
@@ -168,7 +170,8 @@
 EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
 	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in doc doxygen.config \
 	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jdmrg565.c \
-	jstdhuff.c LICENSE.txt
+	jstdhuff.c jdcoefct.h jdmainct.h jdmaster.h jdsample.h wrppm.h \
+	md5/CMakeLists.txt
 
 dist-hook:
 	rm -rf `find $(distdir) -name .svn`
@@ -210,6 +213,10 @@
 MD5_PPM_420M_ISLOW_3_8 = 343d19015531b7bbe746124127244fa8
 MD5_PPM_420M_ISLOW_1_4 = 35fd59d866e44659edfa3c18db2a3edb
 MD5_PPM_420M_ISLOW_1_8 = ccaed48ac0aedefda5d4abe4013f4ad7
+MD5_PPM_420_ISLOW_SKIP15_31 = 86664cd9dc956536409e44e244d20a97
+MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 452a21656115a163029cfba5c04fa76a
+MD5_PPM_444_ISLOW_SKIP1_6 = ef63901f71ef7a75cd78253fc0914f84
+MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = 15b173fb5872d9575572fbcc1b05956f
 MD5_JPEG_CROP = cdb35ff4b4519392690ea040c56ea99c
 
 else
@@ -233,7 +240,7 @@
 MD5_BMP_GRAY_ISLOW_565 = 12f78118e56a2f48b966f792fedf23cc
 MD5_BMP_GRAY_ISLOW_565D = bdbbd616441a24354c98553df5dc82db
 MD5_JPEG_420S_IFAST_OPT = 388708217ac46273ca33086b22827ed8
-# See README-turbo.txt for more details on why this next bit is necessary.
+# See README.md for more details on why this next bit is necessary.
 MD5_JPEG_3x2_FLOAT_PROG_SSE = 343e3f8caf8af5986ebaf0bdc13b5c71
 MD5_PPM_3x2_FLOAT_SSE = 1a75f36e5904d6fc3a85a43da9ad89bb
 MD5_JPEG_3x2_FLOAT_PROG_32BIT = 9bca803d2042bd1eb03819e2bf92b3e5
@@ -262,6 +269,13 @@
 MD5_BMP_420_ISLOW_565D = 6bde71526acc44bcff76f696df8638d2
 MD5_BMP_420M_ISLOW_565 = 8dc0185245353cfa32ad97027342216f
 MD5_BMP_420M_ISLOW_565D =d1be3a3339166255e76fa50a0d70d73e
+MD5_PPM_420_ISLOW_SKIP15_31 = c4c65c1e43d7275cd50328a61e6534f0
+MD5_PPM_420_ISLOW_ARI_SKIP16_139 = 087c6b123db16ac00cb88c5b590bb74a
+MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 26eb36ccc7d1f0cb80cdabb0ac8b5d99
+MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 = 886c6775af22370257122f8b16207e6d
+MD5_PPM_444_ISLOW_SKIP1_6 = 5606f86874cf26b8fcee1117a0a436a6
+MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = db87dc7ce26bcdc7a6b56239ce2b9d6c
+MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 = cb57b32bd6d03e35432362f7bf184b6d
 MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
 
 endif
@@ -269,7 +283,11 @@
 .PHONY: test
 test: tjquicktest tjbittest bittest
 
+if CROSS_COMPILING
+tjquicktest: testclean
+else
 tjquicktest: testclean all
+endif
 
 if WITH_TURBOJPEG
 if WITH_JAVA
@@ -288,7 +306,11 @@
 endif
 	echo GREAT SUCCESS!
 
+if CROSS_COMPILING
+tjbittest: testclean
+else
 tjbittest: testclean all
+endif
 
 if WITH_TURBOJPEG
 
@@ -329,7 +351,7 @@
 	for i in 8 16 32 64 128; do \
 		md5/md5cmp $(MD5_PPM_444_TILE) testout_tile_444_Q95_$$i\x$$i.ppm; \
 	done
-	rm testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_*
+	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_*
 
 	./tjbench testout_tile.ppm 95 -rgb -fastupsample -quiet -tile -benchtime 0.01 >/dev/null 2>&1
 	md5/md5cmp $(MD5_PPM_420M_8x8_TILE) testout_tile_420_Q95_8x8.ppm
@@ -340,12 +362,16 @@
 	for i in 16 32 64 128; do \
 		md5/md5cmp $(MD5_PPM_422M_TILE) testout_tile_422_Q95_$$i\x$$i.ppm; \
 	done
-	rm testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_* testout_tile.ppm
+	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_* testout_tile.ppm
 	echo GREAT SUCCESS!
 
 endif
 
+if CROSS_COMPILING
+bittest: testclean
+else
 bittest: testclean all
+endif
 
 # These tests are carefully crafted to provide full coverage of as many of the
 # underlying algorithms as possible (including all of the SIMD-accelerated
@@ -357,18 +383,18 @@
 # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -ppm -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg
 	md5/md5cmp $(MD5_PPM_RGB_ISLOW) testout_rgb_islow.ppm
-	rm testout_rgb_islow.ppm
+	rm -f testout_rgb_islow.ppm
 if WITH_12BIT
-	rm testout_rgb_islow.jpg
+	rm -f testout_rgb_islow.jpg
 else
 # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg
 	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565) testout_rgb_islow_565.bmp
-	rm testout_rgb_islow_565.bmp
+	rm -f testout_rgb_islow_565.bmp
 # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -bmp -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
 	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565D) testout_rgb_islow_565D.bmp
-	rm testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
+	rm -f testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
 endif
 
 # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
@@ -377,22 +403,22 @@
 # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
 	./djpeg -dct fast -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg
 	md5/md5cmp $(MD5_PPM_422_IFAST) testout_422_ifast.ppm
-	rm testout_422_ifast.ppm
+	rm -f testout_422_ifast.ppm
 # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
 	./djpeg -dct fast -nosmooth -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg
 	md5/md5cmp $(MD5_PPM_422M_IFAST) testout_422m_ifast.ppm
-	rm testout_422m_ifast.ppm
+	rm -f testout_422m_ifast.ppm
 if WITH_12BIT
-	rm testout_422_ifast_opt.jpg
+	rm -f testout_422_ifast_opt.jpg
 else
 # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
 	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
 	md5/md5cmp $(MD5_BMP_422M_IFAST_565) testout_422m_ifast_565.bmp
-	rm testout_422m_ifast_565.bmp
+	rm -f testout_422m_ifast_565.bmp
 # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
 	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
 	md5/md5cmp $(MD5_BMP_422M_IFAST_565D) testout_422m_ifast_565D.bmp
-	rm testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
+	rm -f testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
 endif
 
 # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
@@ -401,11 +427,11 @@
 # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
 	./djpeg -dct fast -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
 	md5/md5cmp $(MD5_PPM_420_Q100_IFAST) testout_420_q100_ifast.ppm
-	rm testout_420_q100_ifast.ppm
+	rm -f testout_420_q100_ifast.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
 	./djpeg -dct fast -nosmooth -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
 	md5/md5cmp $(MD5_PPM_420M_Q100_IFAST) testout_420m_q100_ifast.ppm
-	rm testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+	rm -f testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
 
 # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
 	./cjpeg -gray -dct int -outfile testout_gray_islow.jpg $(srcdir)/testimages/testorig.ppm
@@ -413,29 +439,29 @@
 # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -outfile testout_gray_islow.ppm testout_gray_islow.jpg
 	md5/md5cmp $(MD5_PPM_GRAY_ISLOW) testout_gray_islow.ppm
-	rm testout_gray_islow.ppm
+	rm -f testout_gray_islow.ppm
 # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg
 	md5/md5cmp $(MD5_PPM_GRAY_ISLOW_RGB) testout_gray_islow_rgb.ppm
-	rm testout_gray_islow_rgb.ppm
+	rm -f testout_gray_islow_rgb.ppm
 if WITH_12BIT
-	rm testout_gray_islow.jpg
+	rm -f testout_gray_islow.jpg
 else
 # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg
 	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565) testout_gray_islow_565.bmp
-	rm testout_gray_islow_565.bmp
+	rm -f testout_gray_islow_565.bmp
 # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -bmp -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg
 	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565D) testout_gray_islow_565D.bmp
-	rm testout_gray_islow_565D.bmp testout_gray_islow.jpg
+	rm -f testout_gray_islow_565D.bmp testout_gray_islow.jpg
 endif
 
 # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
 # ENT: 2-pass huff
 	./cjpeg -sample 2x2 -smooth 1 -dct int -opt -outfile testout_420s_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_420S_IFAST_OPT) testout_420s_ifast_opt.jpg
-	rm testout_420s_ifast_opt.jpg
+	rm -f testout_420s_ifast_opt.jpg
 
 # The output of the floating point tests is not validated by default, because
 # the output differs depending on the type of floating point math used, and
@@ -469,7 +495,7 @@
 	elif [ "${FLOATTEST}" = "64bit" ]; then \
 		md5/md5cmp $(MD5_PPM_3x2_FLOAT_64BIT) testout_3x2_float.ppm; \
 	fi
-	rm testout_3x2_float.ppm testout_3x2_float_prog.jpg
+	rm -f testout_3x2_float.ppm testout_3x2_float_prog.jpg
 
 # CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
 	./cjpeg -sample 3x2 -dct fast -prog -outfile testout_3x2_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
@@ -477,106 +503,151 @@
 # CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
 	./djpeg -dct fast -outfile testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
 	md5/md5cmp $(MD5_PPM_3x2_IFAST) testout_3x2_ifast.ppm
-	rm testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
+	rm -f testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
 
 if WITH_ARITH_ENC
 # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
 	./cjpeg -dct int -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
-	rm testout_420_islow_ari.jpg
+	rm -f testout_420_islow_ari.jpg
 	./jpegtran -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testimgint.jpg
 	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
-	rm testout_420_islow_ari.jpg
+	rm -f testout_420_islow_ari.jpg
 # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
-	./cjpeg -sample 1x1 -dct int -progressive -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
+	./cjpeg -sample 1x1 -dct int -prog -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_444_ISLOW_PROGARI) testout_444_islow_progari.jpg
-	rm testout_444_islow_progari.jpg
+	rm -f testout_444_islow_progari.jpg
 endif
 if WITH_ARITH_DEC
 # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
 	./djpeg -fast -ppm -outfile testout_420m_ifast_ari.ppm $(srcdir)/testimages/testimgari.jpg
 	md5/md5cmp $(MD5_PPM_420M_IFAST_ARI) testout_420m_ifast_ari.ppm
-	rm testout_420m_ifast_ari.ppm
+	rm -f testout_420m_ifast_ari.ppm
 	./jpegtran -outfile testout_420_islow.jpg $(srcdir)/testimages/testimgari.jpg
 	md5/md5cmp $(MD5_JPEG_420_ISLOW) testout_420_islow.jpg
-	rm testout_420_islow.jpg
+	rm -f testout_420_islow.jpg
 endif
 
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
 	./djpeg -dct int -scale 2/1 -nosmooth -ppm -outfile testout_420m_islow_2_1.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_2_1) testout_420m_islow_2_1.ppm
-	rm testout_420m_islow_2_1.ppm
+	rm -f testout_420m_islow_2_1.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
 	./djpeg -dct int -scale 15/8 -nosmooth -ppm -outfile testout_420m_islow_15_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_15_8) testout_420m_islow_15_8.ppm
-	rm testout_420m_islow_15_8.ppm
+	rm -f testout_420m_islow_15_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
 	./djpeg -dct int -scale 13/8 -nosmooth -ppm -outfile testout_420m_islow_13_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_13_8) testout_420m_islow_13_8.ppm
-	rm testout_420m_islow_13_8.ppm
+	rm -f testout_420m_islow_13_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
 	./djpeg -dct int -scale 11/8 -nosmooth -ppm -outfile testout_420m_islow_11_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_11_8) testout_420m_islow_11_8.ppm
-	rm testout_420m_islow_11_8.ppm
+	rm -f testout_420m_islow_11_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
 	./djpeg -dct int -scale 9/8 -nosmooth -ppm -outfile testout_420m_islow_9_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_9_8) testout_420m_islow_9_8.ppm
-	rm testout_420m_islow_9_8.ppm
+	rm -f testout_420m_islow_9_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow  ENT: huff
 	./djpeg -dct int -scale 7/8 -nosmooth -ppm -outfile testout_420m_islow_7_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_7_8) testout_420m_islow_7_8.ppm
-	rm testout_420m_islow_7_8.ppm
+	rm -f testout_420m_islow_7_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow  ENT: huff
 	./djpeg -dct int -scale 3/4 -nosmooth -ppm -outfile testout_420m_islow_3_4.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_4) testout_420m_islow_3_4.ppm
-	rm testout_420m_islow_3_4.ppm
+	rm -f testout_420m_islow_3_4.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow  ENT: huff
 	./djpeg -dct int -scale 5/8 -nosmooth -ppm -outfile testout_420m_islow_5_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_5_8) testout_420m_islow_5_8.ppm
-	rm testout_420m_islow_5_8.ppm
+	rm -f testout_420m_islow_5_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow  ENT: huff
 	./djpeg -dct int -scale 1/2 -nosmooth -ppm -outfile testout_420m_islow_1_2.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_2) testout_420m_islow_1_2.ppm
-	rm testout_420m_islow_1_2.ppm
+	rm -f testout_420m_islow_1_2.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow  ENT: huff
 	./djpeg -dct int -scale 3/8 -nosmooth -ppm -outfile testout_420m_islow_3_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_8) testout_420m_islow_3_8.ppm
-	rm testout_420m_islow_3_8.ppm
+	rm -f testout_420m_islow_3_8.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow  ENT: huff
 	./djpeg -dct int -scale 1/4 -nosmooth -ppm -outfile testout_420m_islow_1_4.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_4) testout_420m_islow_1_4.ppm
-	rm testout_420m_islow_1_4.ppm
+	rm -f testout_420m_islow_1_4.ppm
 # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow  ENT: huff
 	./djpeg -dct int -scale 1/8 -nosmooth -ppm -outfile testout_420m_islow_1_8.ppm $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_8) testout_420m_islow_1_8.ppm
-	rm testout_420m_islow_1_8.ppm
+	rm -f testout_420m_islow_1_8.ppm
 if WITH_12BIT
 else
 # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
 	./djpeg -dct int -colors 256 -bmp -outfile testout_420_islow_256.bmp $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_BMP_420_ISLOW_256) testout_420_islow_256.bmp
-	rm testout_420_islow_256.bmp
+	rm -f testout_420_islow_256.bmp
 # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_420_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_BMP_420_ISLOW_565) testout_420_islow_565.bmp
-	rm testout_420_islow_565.bmp
+	rm -f testout_420_islow_565.bmp
 # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
 	./djpeg -dct int -rgb565 -bmp -outfile testout_420_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_BMP_420_ISLOW_565D) testout_420_islow_565D.bmp
-	rm testout_420_islow_565D.bmp
+	rm -f testout_420_islow_565D.bmp
 # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
 	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_420m_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_BMP_420M_ISLOW_565) testout_420m_islow_565.bmp
-	rm testout_420m_islow_565.bmp
+	rm -f testout_420m_islow_565.bmp
 # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
 	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_420m_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_BMP_420M_ISLOW_565D) testout_420m_islow_565D.bmp
-	rm testout_420m_islow_565D.bmp
+	rm -f testout_420m_islow_565D.bmp
+endif
+
+# Partial decode tests.  These tests are designed to cover all of the possible
+# code paths in jpeg_skip_scanlines().
+
+# Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
+	./djpeg -dct int -skip 15,31 -ppm -outfile testout_420_islow_skip15,31.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420_ISLOW_SKIP15_31) testout_420_islow_skip15,31.ppm
+	rm -f testout_420_islow_skip15,31.ppm
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
+if WITH_ARITH_DEC
+	./djpeg -dct int -skip 16,139 -ppm -outfile testout_420_islow_ari_skip16,139.ppm $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_SKIP16_139) testout_420_islow_ari_skip16,139.ppm
+	rm -f testout_420_islow_ari_skip16,139.ppm
+endif
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
+	./cjpeg -dct int -prog -outfile testout_420_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -crop 62x62+71+71 -ppm -outfile testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71) testout_420_islow_prog_crop62x62,71,71.ppm
+	rm -f testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
+if WITH_ARITH_DEC
+	./djpeg -dct int -crop 53x53+4+4 -ppm -outfile testout_420_islow_ari_crop53x53,4,4.ppm $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4) testout_420_islow_ari_crop53x53,4,4.ppm
+	rm -f testout_420_islow_ari_crop53x53,4,4.ppm
+endif
+# Context rows: No   Intra-iMCU row: Yes  ENT: huff
+	./cjpeg -dct int -sample 1x1 -outfile testout_444_islow.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -skip 1,6 -ppm -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_SKIP1_6) testout_444_islow_skip1,6.ppm
+	rm -f testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+# Context rows: No   Intra-iMCU row: No   ENT: prog huff
+	./cjpeg -dct int -prog -sample 1x1 -outfile testout_444_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -crop 98x98+13+13 -ppm -outfile testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13) testout_444_islow_prog_crop98x98,13,13.ppm
+	rm -f testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+# Context rows: No   Intra-iMCU row: No   ENT: arith
+if WITH_ARITH_ENC
+	./cjpeg -dct int -arithmetic -sample 1x1 -outfile testout_444_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
+if WITH_ARITH_DEC
+	./djpeg -dct int -crop 37x37+0+0 -ppm -outfile testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0) testout_444_islow_ari_crop37x37,0,0.ppm
+	rm -f testout_444_islow_ari_crop37x37,0,0.ppm
+endif
+	rm -f testout_444_islow_ari.jpg
 endif
 
 	./jpegtran -crop 120x90+20+50 -transpose -perfect -outfile testout_crop.jpg $(srcdir)/testimages/$(TESTORIG)
 	md5/md5cmp $(MD5_JPEG_CROP) testout_crop.jpg
-	rm testout_crop.jpg
+	rm -f testout_crop.jpg
 	echo GREAT SUCCESS!
 
 
diff --git a/README-turbo.txt b/README-turbo.txt
deleted file mode 100755
index 28b6c4d..0000000
--- a/README-turbo.txt
+++ /dev/null
@@ -1,339 +0,0 @@
-*******************************************************************************
-**     Background
-*******************************************************************************
-
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as
-libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines.  In many cases, the performance of
-libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
-
-libjpeg-turbo implements both the traditional libjpeg API as well as the less
-powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
-colorspace extensions that allow it to compress from/decompress to 32-bit and
-big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
-interface.
-
-libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
-derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
-VirtualGL projects made numerous enhancements to the codec in 2009, and in
-early 2010, libjpeg-turbo spun off into an independent project, with the goal
-of making high-speed JPEG compression/decompression technology available to a
-broader range of users and developers.
-
-
-*******************************************************************************
-**     License
-*******************************************************************************
-
-libjpeg-turbo is covered by three compatible BSD-style open source licenses.
-Refer to LICENSE.txt for a roll-up of license terms.
-
-
-*******************************************************************************
-**     Using libjpeg-turbo
-*******************************************************************************
-
-libjpeg-turbo includes two APIs that can be used to compress and decompress
-JPEG images:
-
-  TurboJPEG API:  This API provides an easy-to-use interface for compressing
-  and decompressing JPEG images in memory.  It also provides some functionality
-  that would not be straightforward to achieve using the underlying libjpeg
-  API, such as generating planar YUV images and performing multiple
-  simultaneous lossless transforms on an image.  The Java interface for
-  libjpeg-turbo is written on top of the TurboJPEG API.
-
-  libjpeg API:  This is the de facto industry-standard API for compressing and
-  decompressing JPEG images.  It is more difficult to use than the TurboJPEG
-  API but also more powerful.  The libjpeg API implementation in libjpeg-turbo
-  is both API/ABI-compatible and mathematically compatible with libjpeg v6b.
-  It can also optionally be configured to be API/ABI-compatible with libjpeg v7
-  and v8 (see below.)
-
-There is no significant performance advantage to either API when both are used
-to perform similar operations.
-
-=====================
-Colorspace Extensions
-=====================
-
-libjpeg-turbo includes extensions that allow JPEG images to be compressed
-directly from (and decompressed directly to) buffers that use BGR, BGRX,
-RGBX, XBGR, and XRGB pixel ordering.  This is implemented with ten new
-colorspace constants:
-
-  JCS_EXT_RGB   /* red/green/blue */
-  JCS_EXT_RGBX  /* red/green/blue/x */
-  JCS_EXT_BGR   /* blue/green/red */
-  JCS_EXT_BGRX  /* blue/green/red/x */
-  JCS_EXT_XBGR  /* x/blue/green/red */
-  JCS_EXT_XRGB  /* x/red/green/blue */
-  JCS_EXT_RGBA  /* red/green/blue/alpha */
-  JCS_EXT_BGRA  /* blue/green/red/alpha */
-  JCS_EXT_ABGR  /* alpha/blue/green/red */
-  JCS_EXT_ARGB  /* alpha/red/green/blue */
-
-Setting cinfo.in_color_space (compression) or cinfo.out_color_space
-(decompression) to one of these values will cause libjpeg-turbo to read the
-red, green, and blue values from (or write them to) the appropriate position in
-the pixel when compressing from/decompressing to an RGB buffer.
-
-Your application can check for the existence of these extensions at compile
-time with:
-
-  #ifdef JCS_EXTENSIONS
-
-At run time, attempting to use these extensions with a libjpeg implementation
-that does not support them will result in a "Bogus input colorspace" error.
-Applications can trap this error in order to test whether run-time support is
-available for the colorspace extensions.
-
-When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the
-X byte is undefined, and in order to ensure the best performance, libjpeg-turbo
-can set that byte to whatever value it wishes.  If an application expects the X
-byte to be used as an alpha channel, then it should specify JCS_EXT_RGBA,
-JCS_EXT_BGRA, JCS_EXT_ABGR, or JCS_EXT_ARGB.  When these colorspace constants
-are used, the X byte is guaranteed to be 0xFF, which is interpreted as opaque.
-
-Your application can check for the existence of the alpha channel colorspace
-extensions at compile time with:
-
-  #ifdef JCS_ALPHA_EXTENSIONS
-
-jcstest.c, located in the libjpeg-turbo source tree, demonstrates how to check
-for the existence of the colorspace extensions at compile time and run time.
-
-===================================
-libjpeg v7 and v8 API/ABI Emulation
-===================================
-
-With libjpeg v7 and v8, new features were added that necessitated extending the
-compression and decompression structures.  Unfortunately, due to the exposed
-nature of those structures, extending them also necessitated breaking backward
-ABI compatibility with previous libjpeg releases.  Thus, programs that were
-built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
-based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are not
-as widely used as v6b, enough programs (including a few Linux distros) made
-the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs
-in libjpeg-turbo.  It should be noted, however, that this feature was added
-primarily so that applications that had already been compiled to use libjpeg
-v7+ could take advantage of accelerated baseline JPEG encoding/decoding
-without recompiling.  libjpeg-turbo does not claim to support all of the
-libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
-cases (see below.)
-
-By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
-argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
-of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that programs
-that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.  The
-following section describes which libjpeg v7+ features are supported and which
-aren't.
-
-Support for libjpeg v7 and v8 Features:
----------------------------------------
-
-Fully supported:
-
--- libjpeg: IDCT scaling extensions in decompressor
-   libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
-   1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
-   and 1/2 are SIMD-accelerated.)
-
--- libjpeg: arithmetic coding
-
--- libjpeg: In-memory source and destination managers
-   See notes below.
-
--- cjpeg: Separate quality settings for luminance and chrominance
-   Note that the libpjeg v7+ API was extended to accommodate this feature only
-   for convenience purposes.  It has always been possible to implement this
-   feature with libjpeg v6b (see rdswitch.c for an example.)
-
--- cjpeg: 32-bit BMP support
-
--- cjpeg: -rgb option
-
--- jpegtran: lossless cropping
-
--- jpegtran: -perfect option
-
--- jpegtran: forcing width/height when performing lossless crop
-
--- rdjpgcom: -raw option
-
--- rdjpgcom: locale awareness
-
-
-Not supported:
-
-NOTE:  As of this writing, extensive research has been conducted into the
-usefulness of DCT scaling as a means of data reduction and SmartScale as a
-means of quality improvement.  The reader is invited to peruse the research at
-http://www.libjpeg-turbo.org/About/SmartScale and draw his/her own conclusions,
-but it is the general belief of our project that these features have not
-demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
-
--- libjpeg: DCT scaling in compressor
-   cinfo.scale_num and cinfo.scale_denom are silently ignored.
-   There is no technical reason why DCT scaling could not be supported when
-   emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
-   below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
-   8/9 would be available, which is of limited usefulness.
-
--- libjpeg: SmartScale
-   cinfo.block_size is silently ignored.
-   SmartScale is an extension to the JPEG format that allows for DCT block
-   sizes other than 8x8.  Providing support for this new format would be
-   feasible (particularly without full acceleration.)  However, until/unless
-   the format becomes either an official industry standard or, at minimum, an
-   accepted solution in the community, we are hesitant to implement it, as
-   there is no sense of whether or how it might change in the future.  It is
-   our belief that SmartScale has not demonstrated sufficient usefulness as a
-   lossless format nor as a means of quality enhancement, and thus, our primary
-   interest in providing this feature would be as a means of supporting
-   additional DCT scaling factors.
-
--- libjpeg: Fancy downsampling in compressor
-   cinfo.do_fancy_downsampling is silently ignored.
-   This requires the DCT scaling feature, which is not supported.
-
--- jpegtran: Scaling
-   This requires both the DCT scaling and SmartScale features, which are not
-   supported.
-
--- Lossless RGB JPEG files
-   This requires the SmartScale feature, which is not supported.
-
-What About libjpeg v9?
-----------------------
-
-libjpeg v9 introduced yet another field to the JPEG compression structure
-(color_transform), thus making the ABI backward incompatible with that of
-libjpeg v8.  This new field was introduced solely for the purpose of supporting
-lossless SmartScale encoding.  Further, there was actually no reason to extend
-the API in this manner, as the color transform could have just as easily been
-activated by way of a new JPEG colorspace constant, thus preserving backward
-ABI compatibility.
-
-Our research (see link above) has shown that lossless SmartScale does not
-generally accomplish anything that can't already be accomplished better with
-existing, standard lossless formats.  Thus, at this time, it is our belief that
-there is not sufficient technical justification for software to upgrade from
-libjpeg v8 to libjpeg v9, and therefore, not sufficient technical justification
-for us to emulate the libjpeg v9 ABI.
-
-=====================================
-In-Memory Source/Destination Managers
-=====================================
-
-By default, libjpeg-turbo 1.3 and later includes the jpeg_mem_src() and
-jpeg_mem_dest() functions, even when not emulating the libjpeg v8 API/ABI.
-Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8
-API/ABI emulation in order to use the in-memory source/destination managers,
-but several projects requested that those functions be included when emulating
-the libjpeg v6b API/ABI as well.  This allows the use of those functions by
-programs that need them without breaking ABI compatibility for programs that
-don't, and it allows those functions to be provided in the "official"
-libjpeg-turbo binaries.
-
-Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of --without-mem-srcdst to configure or
-an argument of -DWITH_MEM_SRCDST=0 to CMake prior to building libjpeg-turbo.
-This will restore the pre-1.3 behavior, in which jpeg_mem_src() and
-jpeg_mem_dest() are only included when emulating the libjpeg v8 API/ABI.
-
-On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
-emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
-
-Note that, on most Un*x systems, the dynamic linker will not look for a
-function in a library until that function is actually used.  Thus, if a program
-is built against libjpeg-turbo 1.3+ and uses jpeg_mem_src() or jpeg_mem_dest(),
-that program will not fail if run against an older version of libjpeg-turbo or
-against libjpeg v7- until the program actually tries to call jpeg_mem_src() or
-jpeg_mem_dest().  Such is not the case on Windows.  If a program is built
-against the libjpeg-turbo 1.3+ DLL and uses jpeg_mem_src() or jpeg_mem_dest(),
-then it must use the libjpeg-turbo 1.3+ DLL at run time.
-
-Both cjpeg and djpeg have been extended to allow testing the in-memory
-source/destination manager functions.  See their respective man pages for more
-details.
-
-
-*******************************************************************************
-**     Mathematical Compatibility
-*******************************************************************************
-
-For the most part, libjpeg-turbo should produce identical output to libjpeg
-v6b.  The one exception to this is when using the floating point DCT/IDCT, in
-which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the
-following reasons:
-
--- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so
-   slightly more accurate than the implementation in libjpeg v6b, but not by
-   any amount perceptible to human vision (generally in the range of 0.01 to
-   0.08 dB gain in PNSR.)
--- When not using the SIMD extensions, libjpeg-turbo uses the more accurate
-   (and slightly faster) floating point IDCT algorithm introduced in libjpeg
-   v8a as opposed to the algorithm used in libjpeg v6b.  It should be noted,
-   however, that this algorithm basically brings the accuracy of the floating
-   point IDCT in line with the accuracy of the slow integer IDCT.  The floating
-   point DCT/IDCT algorithms are mainly a legacy feature, and they do not
-   produce significantly more accuracy than the slow integer algorithms (to put
-   numbers on this, the typical difference in PNSR between the two algorithms
-   is less than 0.10 dB, whereas changing the quality level by 1 in the upper
-   range of the quality scale is typically more like a 1.0 dB difference.)
--- If the floating point algorithms in libjpeg-turbo are not implemented using
-   SIMD instructions on a particular platform, then the accuracy of the
-   floating point DCT/IDCT can depend on the compiler settings.
-
-While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood, it is
-still using the same algorithms as libjpeg v6b, so there are several specific
-cases in which libjpeg-turbo cannot be expected to produce the same output as
-libjpeg v8:
-
--- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8
-   implements those scaling algorithms differently than libjpeg v6b does, and
-   libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior.
-
--- When using chrominance subsampling, because libjpeg v8 implements this
-   with its DCT/IDCT scaling algorithms rather than with a separate
-   downsampling/upsampling algorithm.  In our testing, the subsampled/upsampled
-   output of libjpeg v8 is less accurate than that of libjpeg v6b for this
-   reason.
-
--- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or
-   "non-smooth") chrominance upsampling, because libjpeg v8 does not support
-   merged upsampling with scaling factors > 1.
-
-
-*******************************************************************************
-**     Performance Pitfalls
-*******************************************************************************
-
-===============
-Restart Markers
-===============
-
-The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
-in a way that makes the rest of the libjpeg infrastructure happy, so it is
-necessary to use the slow Huffman decoder when decompressing a JPEG image that
-has restart markers.  This can cause the decompression performance to drop by
-as much as 20%, but the performance will still be much greater than that of
-libjpeg.  Many consumer packages, such as PhotoShop, use restart markers when
-generating JPEG images, so images generated by those programs will experience
-this issue.
-
-===============================================
-Fast Integer Forward DCT at High Quality Levels
-===============================================
-
-The algorithm used by the SIMD-accelerated quantization function cannot produce
-correct results whenever the fast integer forward DCT is used along with a JPEG
-quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
-function in those cases.  This causes performance to drop by as much as 40%.
-It is therefore strongly advised that you use the slow integer forward DCT
-whenever encoding images with a JPEG quality of 98 or higher.
diff --git a/README b/README.ijg
similarity index 93%
rename from README
rename to README.ijg
index e82a095..9c450ce 100644
--- a/README
+++ b/README.ijg
@@ -1,7 +1,7 @@
 libjpeg-turbo note:  This file has been modified by The libjpeg-turbo Project
 to include only information relevant to libjpeg-turbo, to wordsmith certain
 sections, and to remove impolitic language that existed in the libjpeg v8
-README.  It is included only for reference.  Please see README-turbo.txt for
+README.  It is included only for reference.  Please see README.md for
 information specific to libjpeg-turbo.
 
 
@@ -128,7 +128,7 @@
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -166,11 +166,11 @@
 but is also freely distributable.
 
 The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent, GIF reading support has
-been removed altogether, and the GIF writer has been simplified to produce
-"uncompressed GIFs".  This technique does not use the LZW algorithm; the
-resulting GIF files are larger than usual, but are readable by all standard
-GIF decoders.
+To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
+support has been removed altogether, and the GIF writer has been simplified
+to produce "uncompressed GIFs".  This technique does not use the LZW
+algorithm; the resulting GIF files are larger than usual, but are readable
+by all standard GIF decoders.
 
 We are required to state that
     "The Graphics Interchange Format(c) is the Copyright property of
@@ -189,8 +189,8 @@
 	Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
-handy, a PostScript file containing a revised version of Wallace's article is
-available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
+handy, a PDF file containing a revised version of Wallace's article is
+available at http://www.ijg.org/files/Wallace.JPEG.pdf.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -246,9 +246,7 @@
 
 The "official" archive site for this software is www.ijg.org.
 The most recent released version can always be found there in
-directory "files".  This particular version will be archived as
-http://www.ijg.org/files/jpegsrc.v8d.tar.gz, and in Windows-compatible
-"zip" archive format as http://www.ijg.org/files/jpegsr8d.zip.
+directory "files".
 
 The JPEG FAQ (Frequently Asked Questions) article is a source of some
 general information about JPEG.
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..ca8866e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,341 @@
+Background
+==========
+
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
+NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+generally 2-6x as fast as libjpeg, all else being equal.  On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines.  In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+
+libjpeg-turbo implements both the traditional libjpeg API and the less
+powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
+colorspace extensions that allow it to compress from/decompress to 32-bit and
+big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
+interface.
+
+libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
+derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
+VirtualGL projects made numerous enhancements to the codec in 2009, and in
+early 2010, libjpeg-turbo spun off into an independent project, with the goal
+of making high-speed JPEG compression/decompression technology available to a
+broader range of users and developers.
+
+
+License
+=======
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses.
+Refer to [LICENSE.md](LICENSE.md) for a roll-up of license terms.
+
+
+Building libjpeg-turbo
+======================
+
+Refer to [BUILDING.md](BUILDING.md) for complete instructions.
+
+
+Using libjpeg-turbo
+===================
+
+libjpeg-turbo includes two APIs that can be used to compress and decompress
+JPEG images:
+
+- **TurboJPEG API**  
+  This API provides an easy-to-use interface for compressing and decompressing
+  JPEG images in memory.  It also provides some functionality that would not be
+  straightforward to achieve using the underlying libjpeg API, such as
+  generating planar YUV images and performing multiple simultaneous lossless
+  transforms on an image.  The Java interface for libjpeg-turbo is written on
+  top of the TurboJPEG API.
+
+- **libjpeg API**  
+  This is the de facto industry-standard API for compressing and decompressing
+  JPEG images.  It is more difficult to use than the TurboJPEG API but also
+  more powerful.  The libjpeg API implementation in libjpeg-turbo is both
+  API/ABI-compatible and mathematically compatible with libjpeg v6b.  It can
+  also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8
+  (see below.)
+
+There is no significant performance advantage to either API when both are used
+to perform similar operations.
+
+Colorspace Extensions
+---------------------
+
+libjpeg-turbo includes extensions that allow JPEG images to be compressed
+directly from (and decompressed directly to) buffers that use BGR, BGRX,
+RGBX, XBGR, and XRGB pixel ordering.  This is implemented with ten new
+colorspace constants:
+
+    JCS_EXT_RGB   /* red/green/blue */
+    JCS_EXT_RGBX  /* red/green/blue/x */
+    JCS_EXT_BGR   /* blue/green/red */
+    JCS_EXT_BGRX  /* blue/green/red/x */
+    JCS_EXT_XBGR  /* x/blue/green/red */
+    JCS_EXT_XRGB  /* x/red/green/blue */
+    JCS_EXT_RGBA  /* red/green/blue/alpha */
+    JCS_EXT_BGRA  /* blue/green/red/alpha */
+    JCS_EXT_ABGR  /* alpha/blue/green/red */
+    JCS_EXT_ARGB  /* alpha/red/green/blue */
+
+Setting `cinfo.in_color_space` (compression) or `cinfo.out_color_space`
+(decompression) to one of these values will cause libjpeg-turbo to read the
+red, green, and blue values from (or write them to) the appropriate position in
+the pixel when compressing from/decompressing to an RGB buffer.
+
+Your application can check for the existence of these extensions at compile
+time with:
+
+    #ifdef JCS_EXTENSIONS
+
+At run time, attempting to use these extensions with a libjpeg implementation
+that does not support them will result in a "Bogus input colorspace" error.
+Applications can trap this error in order to test whether run-time support is
+available for the colorspace extensions.
+
+When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the
+X byte is undefined, and in order to ensure the best performance, libjpeg-turbo
+can set that byte to whatever value it wishes.  If an application expects the X
+byte to be used as an alpha channel, then it should specify `JCS_EXT_RGBA`,
+`JCS_EXT_BGRA`, `JCS_EXT_ABGR`, or `JCS_EXT_ARGB`.  When these colorspace
+constants are used, the X byte is guaranteed to be 0xFF, which is interpreted
+as opaque.
+
+Your application can check for the existence of the alpha channel colorspace
+extensions at compile time with:
+
+    #ifdef JCS_ALPHA_EXTENSIONS
+
+[jcstest.c](jcstest.c), located in the libjpeg-turbo source tree, demonstrates
+how to check for the existence of the colorspace extensions at compile time and
+run time.
+
+libjpeg v7 and v8 API/ABI Emulation
+-----------------------------------
+
+With libjpeg v7 and v8, new features were added that necessitated extending the
+compression and decompression structures.  Unfortunately, due to the exposed
+nature of those structures, extending them also necessitated breaking backward
+ABI compatibility with previous libjpeg releases.  Thus, programs that were
+built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
+based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are not
+as widely used as v6b, enough programs (including a few Linux distros) made
+the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs
+in libjpeg-turbo.  It should be noted, however, that this feature was added
+primarily so that applications that had already been compiled to use libjpeg
+v7+ could take advantage of accelerated baseline JPEG encoding/decoding
+without recompiling.  libjpeg-turbo does not claim to support all of the
+libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
+cases (see below.)
+
+By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or
+an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a
+version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that
+programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.
+The following section describes which libjpeg v7+ features are supported and
+which aren't.
+
+### Support for libjpeg v7 and v8 Features
+
+#### Fully supported
+
+- **libjpeg: IDCT scaling extensions in decompressor**  
+  libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
+  1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
+  and 1/2 are SIMD-accelerated.)
+
+- **libjpeg: Arithmetic coding**
+
+- **libjpeg: In-memory source and destination managers**  
+  See notes below.
+
+- **cjpeg: Separate quality settings for luminance and chrominance**  
+  Note that the libjpeg v7+ API was extended to accommodate this feature only
+  for convenience purposes.  It has always been possible to implement this
+  feature with libjpeg v6b (see rdswitch.c for an example.)
+
+- **cjpeg: 32-bit BMP support**
+
+- **cjpeg: `-rgb` option**
+
+- **jpegtran: Lossless cropping**
+
+- **jpegtran: `-perfect` option**
+
+- **jpegtran: Forcing width/height when performing lossless crop**
+
+- **rdjpgcom: `-raw` option**
+
+- **rdjpgcom: Locale awareness**
+
+
+#### Not supported
+
+NOTE:  As of this writing, extensive research has been conducted into the
+usefulness of DCT scaling as a means of data reduction and SmartScale as a
+means of quality improvement.  The reader is invited to peruse the research at
+<http://www.libjpeg-turbo.org/About/SmartScale> and draw his/her own conclusions,
+but it is the general belief of our project that these features have not
+demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
+
+- **libjpeg: DCT scaling in compressor**  
+  `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
+  There is no technical reason why DCT scaling could not be supported when
+  emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
+  below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
+  8/9 would be available, which is of limited usefulness.
+
+- **libjpeg: SmartScale**  
+  `cinfo.block_size` is silently ignored.
+  SmartScale is an extension to the JPEG format that allows for DCT block
+  sizes other than 8x8.  Providing support for this new format would be
+  feasible (particularly without full acceleration.)  However, until/unless
+  the format becomes either an official industry standard or, at minimum, an
+  accepted solution in the community, we are hesitant to implement it, as
+  there is no sense of whether or how it might change in the future.  It is
+  our belief that SmartScale has not demonstrated sufficient usefulness as a
+  lossless format nor as a means of quality enhancement, and thus our primary
+  interest in providing this feature would be as a means of supporting
+  additional DCT scaling factors.
+
+- **libjpeg: Fancy downsampling in compressor**  
+  `cinfo.do_fancy_downsampling` is silently ignored.
+  This requires the DCT scaling feature, which is not supported.
+
+- **jpegtran: Scaling**  
+  This requires both the DCT scaling and SmartScale features, which are not
+  supported.
+
+- **Lossless RGB JPEG files**  
+  This requires the SmartScale feature, which is not supported.
+
+### What About libjpeg v9?
+
+libjpeg v9 introduced yet another field to the JPEG compression structure
+(`color_transform`), thus making the ABI backward incompatible with that of
+libjpeg v8.  This new field was introduced solely for the purpose of supporting
+lossless SmartScale encoding.  Furthermore, there was actually no reason to
+extend the API in this manner, as the color transform could have just as easily
+been activated by way of a new JPEG colorspace constant, thus preserving
+backward ABI compatibility.
+
+Our research (see link above) has shown that lossless SmartScale does not
+generally accomplish anything that can't already be accomplished better with
+existing, standard lossless formats.  Therefore, at this time it is our belief
+that there is not sufficient technical justification for software projects to
+upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient
+technical justification for us to emulate the libjpeg v9 ABI.
+
+In-Memory Source/Destination Managers
+-------------------------------------
+
+By default, libjpeg-turbo 1.3 and later includes the `jpeg_mem_src()` and
+`jpeg_mem_dest()` functions, even when not emulating the libjpeg v8 API/ABI.
+Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8
+API/ABI emulation in order to use the in-memory source/destination managers,
+but several projects requested that those functions be included when emulating
+the libjpeg v6b API/ABI as well.  This allows the use of those functions by
+programs that need them, without breaking ABI compatibility for programs that
+don't, and it allows those functions to be provided in the "official"
+libjpeg-turbo binaries.
+
+Those who are concerned about maintaining strict conformance with the libjpeg
+v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or
+an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building
+libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
+`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
+libjpeg v8 API/ABI.
+
+On Un*x systems, including the in-memory source/destination managers changes
+the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
+emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
+
+Note that, on most Un*x systems, the dynamic linker will not look for a
+function in a library until that function is actually used.  Thus, if a program
+is built against libjpeg-turbo 1.3+ and uses `jpeg_mem_src()` or
+`jpeg_mem_dest()`, that program will not fail if run against an older version
+of libjpeg-turbo or against libjpeg v7- until the program actually tries to
+call `jpeg_mem_src()` or `jpeg_mem_dest()`.  Such is not the case on Windows.
+If a program is built against the libjpeg-turbo 1.3+ DLL and uses
+`jpeg_mem_src()` or `jpeg_mem_dest()`, then it must use the libjpeg-turbo 1.3+
+DLL at run time.
+
+Both cjpeg and djpeg have been extended to allow testing the in-memory
+source/destination manager functions.  See their respective man pages for more
+details.
+
+
+Mathematical Compatibility
+==========================
+
+For the most part, libjpeg-turbo should produce identical output to libjpeg
+v6b.  The one exception to this is when using the floating point DCT/IDCT, in
+which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the
+following reasons:
+
+- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so
+  slightly more accurate than the implementation in libjpeg v6b, but not by
+  any amount perceptible to human vision (generally in the range of 0.01 to
+  0.08 dB gain in PSNR.)
+
+- When not using the SIMD extensions, libjpeg-turbo uses the more accurate
+  (and slightly faster) floating point IDCT algorithm introduced in libjpeg
+  v8a as opposed to the algorithm used in libjpeg v6b.  It should be noted,
+  however, that this algorithm basically brings the accuracy of the floating
+  point IDCT in line with the accuracy of the slow integer IDCT.  The floating
+  point DCT/IDCT algorithms are mainly a legacy feature, and they do not
+  produce significantly more accuracy than the slow integer algorithms (to put
+  numbers on this, the typical difference in PSNR between the two algorithms
+  is less than 0.10 dB, whereas changing the quality level by 1 in the upper
+  range of the quality scale is typically more like a 1.0 dB difference.)
+
+- If the floating point algorithms in libjpeg-turbo are not implemented using
+  SIMD instructions on a particular platform, then the accuracy of the
+  floating point DCT/IDCT can depend on the compiler settings.
+
+While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood it is
+still using the same algorithms as libjpeg v6b, so there are several specific
+cases in which libjpeg-turbo cannot be expected to produce the same output as
+libjpeg v8:
+
+- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8
+  implements those scaling algorithms differently than libjpeg v6b does, and
+  libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior.
+
+- When using chrominance subsampling, because libjpeg v8 implements this
+  with its DCT/IDCT scaling algorithms rather than with a separate
+  downsampling/upsampling algorithm.  In our testing, the subsampled/upsampled
+  output of libjpeg v8 is less accurate than that of libjpeg v6b for this
+  reason.
+
+- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or
+  "non-smooth") chrominance upsampling, because libjpeg v8 does not support
+  merged upsampling with scaling factors > 1.
+
+
+Performance Pitfalls
+====================
+
+Restart Markers
+---------------
+
+The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
+in a way that makes the rest of the libjpeg infrastructure happy, so it is
+necessary to use the slow Huffman decoder when decompressing a JPEG image that
+has restart markers.  This can cause the decompression performance to drop by
+as much as 20%, but the performance will still be much greater than that of
+libjpeg.  Many consumer packages, such as Photoshop, use restart markers when
+generating JPEG images, so images generated by those programs will experience
+this issue.
+
+Fast Integer Forward DCT at High Quality Levels
+-----------------------------------------------
+
+The algorithm used by the SIMD-accelerated quantization function cannot produce
+correct results whenever the fast integer forward DCT is used along with a JPEG
+quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
+function in those cases.  This causes performance to drop by as much as 40%.
+It is therefore strongly advised that you use the slow integer forward DCT
+whenever encoding images with a JPEG quality of 98 or higher.
diff --git a/acinclude.m4 b/acinclude.m4
index 4a13082..2c90762 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -3,8 +3,11 @@
 # Check that NASM exists and determine flags
 AC_DEFUN([AC_PROG_NASM],[
 
-AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
-test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+AC_ARG_VAR(NASM, [NASM command (used to build the x86/x86-64 SIMD code)])
+if test "x$NASM" = "x"; then
+  AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
+  test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+fi
 
 AC_MSG_CHECKING([for object file format of host system])
 case "$host_os" in
@@ -219,16 +222,20 @@
   CC="$CCAS"
   AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
     .text
-    .arch armv8-a+fp+simd
-    movi v0.16b, #100]])], ac_good_gnu_arm_assembler=yes)
+    MYVAR .req x0
+    movi v0.16b, #100
+    mov MYVAR, #100
+    .unreq MYVAR]])], ac_good_gnu_arm_assembler=yes)
 
   ac_use_gas_preprocessor=no
   if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
     CC="gas-preprocessor.pl $CCAS"
     AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
       .text
-      .arch armv8-a+fp+simd
-      movi v0.16b, #100]])], ac_use_gas_preprocessor=yes)
+      MYVAR .req x0
+      movi v0.16b, #100
+      mov MYVAR, #100
+      .unreq MYVAR]])], ac_use_gas_preprocessor=yes)
   fi
   CFLAGS="$ac_save_CFLAGS"
   CC="$ac_save_CC"
diff --git a/cderror.h b/cderror.h
index 6569aef..63de498 100644
--- a/cderror.h
+++ b/cderror.h
@@ -4,7 +4,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the error and message codes for the cjpeg/djpeg
  * applications.  These strings are not needed as part of the JPEG library
diff --git a/cdjpeg.c b/cdjpeg.c
index 7cc0d6e..441d671 100644
--- a/cdjpeg.c
+++ b/cdjpeg.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains common support routines used by the IJG application
  * programs (cjpeg, djpeg, jpegtran).
@@ -81,7 +82,7 @@
  */
 
 GLOBAL(boolean)
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 {
   register int ca, ck;
   register int nmatched = 0;
diff --git a/cdjpeg.h b/cdjpeg.h
index 8461ee3..a65310e 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains common declarations for the sample applications
  * cjpeg and djpeg.  It is NOT used by the core JPEG library.
@@ -23,7 +24,7 @@
  * Object interface for cjpeg's source file decoding modules
  */
 
-typedef struct cjpeg_source_struct * cjpeg_source_ptr;
+typedef struct cjpeg_source_struct *cjpeg_source_ptr;
 
 struct cjpeg_source_struct {
   void (*start_input) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
@@ -41,7 +42,7 @@
  * Object interface for djpeg's output file encoding modules
  */
 
-typedef struct djpeg_dest_struct * djpeg_dest_ptr;
+typedef struct djpeg_dest_struct *djpeg_dest_ptr;
 
 struct djpeg_dest_struct {
   /* start_output is called after jpeg_start_decompress finishes.
@@ -55,7 +56,7 @@
   void (*finish_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
 
   /* Target file spec; filled in by djpeg.c after object is created. */
-  FILE * output_file;
+  FILE *output_file;
 
   /* Output pixel-row buffer.  Created by module init or start_output.
    * Width is cinfo->output_width * cinfo->output_components;
@@ -82,7 +83,7 @@
   int percent_done;
 };
 
-typedef struct cdjpeg_progress_mgr * cd_progress_ptr;
+typedef struct cdjpeg_progress_mgr *cd_progress_ptr;
 
 
 /* Module selection routines for I/O modules. */
@@ -101,9 +102,9 @@
 
 /* cjpeg support routines (in rdswitch.c) */
 
-EXTERN(boolean) read_quant_tables (j_compress_ptr cinfo, char * filename,
+EXTERN(boolean) read_quant_tables (j_compress_ptr cinfo, char *filename,
                                    boolean force_baseline);
-EXTERN(boolean) read_scan_script (j_compress_ptr cinfo, char * filename);
+EXTERN(boolean) read_scan_script (j_compress_ptr cinfo, char *filename);
 EXTERN(boolean) set_quality_ratings (j_compress_ptr cinfo, char *arg,
                                      boolean force_baseline);
 EXTERN(boolean) set_quant_slots (j_compress_ptr cinfo, char *arg);
@@ -111,7 +112,7 @@
 
 /* djpeg support routines (in rdcolmap.c) */
 
-EXTERN(void) read_color_map (j_decompress_ptr cinfo, FILE * infile);
+EXTERN(void) read_color_map (j_decompress_ptr cinfo, FILE *infile);
 
 /* common support routines (in cdjpeg.c) */
 
@@ -119,7 +120,7 @@
 EXTERN(void) start_progress_monitor (j_common_ptr cinfo,
                                      cd_progress_ptr progress);
 EXTERN(void) end_progress_monitor (j_common_ptr cinfo);
-EXTERN(boolean) keymatch (char * arg, const char * keyword, int minchars);
+EXTERN(boolean) keymatch (char *arg, const char *keyword, int minchars);
 EXTERN(FILE *) read_stdin (void);
 EXTERN(FILE *) write_stdout (void);
 
diff --git a/change.log b/change.log
index b60ddd6..f090d77 100644
--- a/change.log
+++ b/change.log
@@ -1,9 +1,28 @@
-NOTE:  This file was modified by The libjpeg-turbo Project to include only
-information relevant to libjpeg-turbo.
+libjpeg-turbo note:  This file has been modified by The libjpeg-turbo Project
+to include only information relevant to libjpeg-turbo.  It is included only for
+reference.  Please see ChangeLog.md for information specific to libjpeg-turbo.
+
 
 CHANGE LOG for Independent JPEG Group's JPEG software
 
 
+Version 9b  17-Jan-2016
+-----------------------
+
+Document 'f' specifier for jpegtran -crop specification.
+Thank to Michele Martone for suggestion.
+
+
+Version 9  13-Jan-2013
+----------------------
+
+Add remark for jpeg_mem_dest() in jdatadst.c.
+Thank to Elie-Gregoire Khoury for the hint.
+
+Correct argument type in format string, avoid compiler warnings.
+Thank to Vincent Torri for hint.
+
+
 Version 8d  15-Jan-2012
 -----------------------
 
diff --git a/cjpeg.1 b/cjpeg.1
index e338c80..d1dc304 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -1,4 +1,4 @@
-.TH CJPEG 1 "21 November 2014"
+.TH CJPEG 1 "17 February 2016"
 .SH NAME
 cjpeg \- compress an image file to a JPEG file
 .SH SYNOPSIS
@@ -85,8 +85,8 @@
 and the closer the output image will be to the original input.  Normally you
 want to use the lowest quality setting (smallest file) that decompresses into
 something visually indistinguishable from the original image.  For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right.  If you see defects at
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at
 .B \-quality
 75, then go up 5 or 10 counts at a time until you are happy with the output
 image.  (The optimal setting will vary from one image to another.)
@@ -94,11 +94,10 @@
 .B \-quality
 100 will generate a quantization table of all 1's, minimizing loss in the
 quantization step (but there is still information loss in subsampling, as well
-as roundoff error).  This setting is mainly of interest for experimental
-purposes.  Quality values above about 95 are
-.B not
-recommended for normal use; the compressed file size goes up dramatically for
-hardly any gain in output image quality.
+as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
 .PP
 In the other direction, quality values below 50 will produce very small files
 of low image quality.  Settings around 5 to 10 might be useful in preparing an
@@ -338,11 +337,11 @@
 This file was modified by The libjpeg-turbo Project to include only information
 relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
 features not present in libjpeg.
-.SH BUGS
+.SH ISSUES
 Support for GIF input files was removed in cjpeg v6b due to concerns over
 the Unisys LZW patent.  Although this patent expired in 2006, cjpeg still
 lacks GIF support, for these historical reasons.  (Conversion of GIF files to
-JPEG is usually a bad idea anyway.)
+JPEG is usually a bad idea anyway, since GIF is a 256-color format.)
 .PP
 Not all variants of BMP and Targa file formats are supported.
 .PP
diff --git a/cjpeg.c b/cjpeg.c
index 92e2824..713224f 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -6,7 +6,8 @@
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2013-2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for the JPEG compressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
@@ -81,7 +82,7 @@
 
 
 LOCAL(cjpeg_source_ptr)
-select_file_type (j_compress_ptr cinfo, FILE * infile)
+select_file_type (j_compress_ptr cinfo, FILE *infile)
 {
   int c;
 
@@ -137,9 +138,9 @@
  */
 
 
-static const char * progname;   /* program name for error messages */
-static char * outfilename;      /* for -outfile switch */
-boolean memdst;  /* for -memdst switch */
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
+boolean memdst;                 /* for -memdst switch */
 
 
 LOCAL(void)
@@ -154,7 +155,8 @@
 #endif
 
   fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -quality N[,...]   Compression quality (0..100; 5-95 is useful range)\n");
+  fprintf(stderr, "  -quality N[,...]   Compression quality (0..100; 5-95 is most useful range,\n");
+  fprintf(stderr, "                     default is 75)\n");
   fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
   fprintf(stderr, "  -rgb           Create RGB JPEG file\n");
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -218,14 +220,14 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
   boolean force_baseline;
   boolean simple_progressive;
-  char * qualityarg = NULL;     /* saves -quality parm if any */
-  char * qtablefile = NULL;     /* saves -qtables filename if any */
-  char * qslotsarg = NULL;      /* saves -qslots parm if any */
-  char * samplearg = NULL;      /* saves -sample parm if any */
-  char * scansarg = NULL;       /* saves -scans parm if any */
+  char *qualityarg = NULL;      /* saves -quality parm if any */
+  char *qtablefile = NULL;      /* saves -qtables filename if any */
+  char *qslotsarg = NULL;       /* saves -qslots parm if any */
+  char *samplearg = NULL;       /* saves -sample parm if any */
+  char *scansarg = NULL;        /* saves -scans parm if any */
 
   /* Set up default JPEG parameters. */
 
@@ -493,8 +495,8 @@
 #endif
   int file_index;
   cjpeg_source_ptr src_mgr;
-  FILE * input_file;
-  FILE * output_file = NULL;
+  FILE *input_file;
+  FILE *output_file = NULL;
   unsigned char *outbuffer = NULL;
   unsigned long outsize = 0;
   JDIMENSION num_scanlines;
diff --git a/cmakescripts/md5cmp.cmake b/cmakescripts/md5cmp.cmake
deleted file mode 100644
index c315aa8..0000000
--- a/cmakescripts/md5cmp.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-if(NOT MD5)
-  message(FATAL_ERROR "MD5 not specified")
-endif()
-
-if(NOT FILE)
-  message(FATAL_ERROR "FILE not specified")
-endif()
-
-file(MD5 ${FILE} MD5FILE)
-
-if(NOT MD5 STREQUAL MD5FILE)
-	message(FATAL_ERROR "MD5 of ${FILE} should be ${MD5}, not ${MD5FILE}.")
-else()
-	message(STATUS "${MD5}: OK")
-endif()
diff --git a/coderules.txt b/coderules.txt
index 8683e9a..a2f593a 100644
--- a/coderules.txt
+++ b/coderules.txt
@@ -4,7 +4,7 @@
 Copyright (C) 1991-1996, Thomas G. Lane.
 It was modified by The libjpeg-turbo Project to include only information
 relevant to libjpeg-turbo.
-For conditions of distribution and use, see the accompanying README file.
+For conditions of distribution and use, see the accompanying README.ijg file.
 
 
 Since numerous people will be contributing code and bug fixes, it's important
diff --git a/configure.ac b/configure.ac
index 0dd7612..c728ee8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.4.3])
+AC_INIT([libjpeg-turbo], [1.4.90])
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
 AC_PREFIX_DEFAULT(/opt/libjpeg-turbo)
@@ -25,6 +25,8 @@
   [BUILD="$with_build_date"],
   [BUILD=`date +%Y%m%d`])
 
+PKG_PROG_PKG_CONFIG
+
 # When the prefix is /opt/libjpeg-turbo, we assume that an "official" binary is
 # being created, and thus we install things into specific locations.
 
@@ -77,7 +79,7 @@
 
 # Check whether compiler supports pointers to undefined structures
 AC_MSG_CHECKING(whether compiler supports pointers to undefined structures)
-AC_TRY_COMPILE([ typedef struct undefined_structure * undef_struct_ptr; ], ,
+AC_TRY_COMPILE([ typedef struct undefined_structure *undef_struct_ptr; ], ,
   AC_MSG_RESULT(yes),
   [AC_MSG_RESULT(no)
    AC_DEFINE([INCOMPLETE_TYPES_BROKEN], [1],
@@ -192,7 +194,7 @@
 RPM_CONFIG_ARGS=
 
 # Memory source/destination managers
-SO_AGE=0
+SO_AGE=1
 MEM_SRCDST_FUNCTIONS=
 if test "x${with_jpeg8}" != "xyes"; then
   AC_MSG_CHECKING([whether to include in-memory source/destination managers])
@@ -203,7 +205,7 @@
     AC_MSG_RESULT(yes)
     AC_DEFINE([MEM_SRCDST_SUPPORTED], [1],
       [Support in-memory source/destination managers])
-    SO_AGE=1
+    SO_AGE=2
     MEM_SRCDST_FUNCTIONS="global:  jpeg_mem_dest;  jpeg_mem_src;";
   else
     AC_MSG_RESULT(no)
@@ -222,6 +224,16 @@
 
 AC_DEFINE_UNQUOTED(LIBJPEG_TURBO_VERSION, [$VERSION], [libjpeg-turbo version])
 
+m4_define(version_triplet,m4_split(AC_PACKAGE_VERSION,[[.]]))
+m4_define(version_major,m4_argn(1,version_triplet))
+m4_define(version_minor,m4_argn(2,version_triplet))
+m4_define(version_revision,m4_argn(3,version_triplet))
+VERSION_MAJOR=version_major
+VERSION_MINOR=version_minor
+VERSION_REVISION=version_revision
+LIBJPEG_TURBO_VERSION_NUMBER=`printf "%d%03d%03d" $VERSION_MAJOR $VERSION_MINOR $VERSION_REVISION`
+AC_DEFINE_UNQUOTED(LIBJPEG_TURBO_VERSION_NUMBER, [$LIBJPEG_TURBO_VERSION_NUMBER], [libjpeg-turbo version in integer form])
+
 VERSION_SCRIPT=yes
 AC_ARG_ENABLE([ld-version-script],
   AS_HELP_STRING([--disable-ld-version-script],
@@ -277,10 +289,13 @@
 AC_MSG_CHECKING([whether to include arithmetic encoding support])
 AC_ARG_WITH([arith-enc],
   AC_HELP_STRING([--without-arith-enc],
-    [Do not include arithmetic encoding support]))
+    [Do not include arithmetic encoding support when emulating the libjpeg v6b API/ABI]))
 if test "x$with_12bit" = "xyes"; then
   with_arith_enc=no
 fi
+if test "x${with_jpeg8}" = "xyes" -o "x${with_jpeg7}" = "xyes"; then
+  with_arith_enc=yes
+fi
 if test "x$with_arith_enc" = "xno"; then
   AC_MSG_RESULT(no)
   RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-enc"
@@ -293,10 +308,13 @@
 AC_MSG_CHECKING([whether to include arithmetic decoding support])
 AC_ARG_WITH([arith-dec],
   AC_HELP_STRING([--without-arith-dec],
-    [Do not include arithmetic decoding support]))
+    [Do not include arithmetic decoding support when emulating the libjpeg v6b API/ABI]))
 if test "x$with_12bit" = "xyes"; then
   with_arith_dec=no
 fi
+if test "x${with_jpeg8}" = "xyes" -o "x${with_jpeg7}" = "xyes"; then
+  with_arith_dec=yes
+fi
 if test "x$with_arith_dec" = "xno"; then
   AC_MSG_RESULT(no)
   RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-dec"
@@ -498,6 +516,10 @@
         fi
       fi
       ;;
+    powerpc*)
+      AC_MSG_RESULT([yes (powerpc)])
+      simd_arch=powerpc
+      ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
       with_simd=no;
@@ -523,8 +545,10 @@
 AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
 AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
 AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
+AM_CONDITIONAL([SIMD_POWERPC], [test "x$simd_arch" = "xpowerpc"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
+AM_CONDITIONAL([CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
 
 AC_ARG_VAR(PKGNAME, [distribution package name (default: libjpeg-turbo)])
 if test "x$PKGNAME" = "x"; then
@@ -571,6 +595,8 @@
 AC_CONFIG_FILES([pkgscripts/makedpkg.tmpl:release/makedpkg.in])
 AC_CONFIG_FILES([pkgscripts/makemacpkg.tmpl:release/makemacpkg.in])
 AC_CONFIG_FILES([pkgscripts/uninstall.tmpl:release/uninstall.in])
+AC_CONFIG_FILES([pkgscripts/libjpeg.pc:release/libjpeg.pc.in])
+AC_CONFIG_FILES([pkgscripts/libturbojpeg.pc:release/libturbojpeg.pc.in])
 if test "x$with_turbojpeg" != "xno"; then
   AC_CONFIG_FILES([tjbenchtest])
 fi
diff --git a/djpeg.1 b/djpeg.1
index 73d0514..7efde43 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -1,4 +1,4 @@
-.TH DJPEG 1 "21 November 2014"
+.TH DJPEG 1 "18 February 2016"
 .SH NAME
 djpeg \- decompress a JPEG file to an image file
 .SH SYNOPSIS
@@ -194,6 +194,18 @@
 Load input file into memory before decompressing.  This feature was implemented
 mainly as a way of testing the in-memory source manager (jpeg_mem_src().)
 .TP
+.BI \-skip " Y0,Y1"
+Decompress all rows of the JPEG image except those between Y0 and Y1
+(inclusive.)  Note that if decompression scaling is being used, then Y0 and Y1
+are relative to the scaled image dimensions.
+.TP
+.BI \-crop " WxH+X+Y"
+Decompress only a rectangular subregion of the image, starting at point X,Y
+with width W and height H.  If necessary, X will be shifted left to the nearest
+iMCU boundary, and the width will be increased accordingly.  Note that if
+decompression scaling is being used, then X, Y, W, and H are relative to the
+scaled image dimensions.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
@@ -271,8 +283,10 @@
 This file was modified by The libjpeg-turbo Project to include only information
 relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
 features not present in libjpeg.
-.SH BUGS
-To avoid the Unisys LZW patent,
-.B djpeg
-produces uncompressed GIF files.  These are larger than they should be, but
-are readable by standard GIF decoders.
+.SH ISSUES
+Support for compressed GIF output files was removed in djpeg v6b due to
+concerns over the Unisys LZW patent.  Although this patent expired in 2006,
+djpeg still lacks compressed GIF support, for these historical reasons.
+(Conversion of JPEG files to GIF is usually a bad idea anyway, since GIF is a
+256-color format.)  The uncompressed GIF files that djpeg generates are larger
+than they should be, but they are readable by standard GIF decoders.
diff --git a/djpeg.c b/djpeg.c
index 8ddff96..54cd525 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -3,9 +3,12 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2013 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010-2011, 2013-2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for the JPEG decompressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
@@ -28,6 +31,7 @@
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 #include "jversion.h"           /* for version message */
 #include "jconfigint.h"
+#include "wrppm.h"
 
 #include <ctype.h>              /* to declare isprint() */
 
@@ -85,9 +89,12 @@
  */
 
 
-static const char * progname;   /* program name for error messages */
-static char * outfilename;      /* for -outfile switch */
-boolean memsrc;  /* for -memsrc switch */
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
+boolean memsrc;                 /* for -memsrc switch */
+boolean skip, crop;
+JDIMENSION skip_start, skip_end;
+JDIMENSION crop_x, crop_y, crop_width, crop_height;
 #define INPUT_BUF_SIZE  4096
 
 
@@ -164,6 +171,8 @@
   fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
 #endif
 
+  fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
+  fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
@@ -183,12 +192,14 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
 
   /* Set up default JPEG parameters. */
   requested_fmt = DEFAULT_FMT;  /* set default output file format */
   outfilename = NULL;
   memsrc = FALSE;
+  skip = FALSE;
+  crop = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -298,7 +309,7 @@
         usage();
       if (for_real) {           /* too expensive to do twice! */
 #ifdef QUANT_2PASS_SUPPORTED    /* otherwise can't quantize to supplied map */
-        FILE * mapfile;
+        FILE *mapfile;
 
         if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
           fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
@@ -361,14 +372,32 @@
       /* RLE output format. */
       requested_fmt = FMT_RLE;
 
-    } else if (keymatch(arg, "scale", 1)) {
+    } else if (keymatch(arg, "scale", 2)) {
       /* Scale the output image by a fraction M/N. */
       if (++argn >= argc)       /* advance to next argument */
         usage();
-      if (sscanf(argv[argn], "%d/%d",
+      if (sscanf(argv[argn], "%u/%u",
                  &cinfo->scale_num, &cinfo->scale_denom) != 2)
         usage();
 
+    } else if (keymatch(arg, "skip", 2)) {
+      if (++argn >= argc)
+        usage();
+      if (sscanf(argv[argn], "%u,%u", &skip_start, &skip_end) != 2 ||
+          skip_start > skip_end)
+        usage();
+      skip = TRUE;
+
+    } else if (keymatch(arg, "crop", 2)) {
+      char c;
+      if (++argn >= argc)
+        usage();
+      if (sscanf(argv[argn], "%u%c%u+%u+%u", &crop_width, &c, &crop_height,
+                 &crop_x, &crop_y) != 5 ||
+          (c != 'X' && c != 'x') || crop_width < 1 || crop_height < 1)
+        usage();
+      crop = TRUE;
+
     } else if (keymatch(arg, "targa", 1)) {
       /* Targa output format. */
       requested_fmt = FMT_TARGA;
@@ -393,7 +422,7 @@
 jpeg_getc (j_decompress_ptr cinfo)
 /* Read next byte */
 {
-  struct jpeg_source_mgr * datasrc = cinfo->src;
+  struct jpeg_source_mgr *datasrc = cinfo->src;
 
   if (datasrc->bytes_in_buffer == 0) {
     if (! (*datasrc->fill_input_buffer) (cinfo))
@@ -408,7 +437,7 @@
 print_text_marker (j_decompress_ptr cinfo)
 {
   boolean traceit = (cinfo->err->trace_level >= 1);
-  INT32 length;
+  long length;
   unsigned int ch;
   unsigned int lastch = 0;
 
@@ -469,8 +498,8 @@
 #endif
   int file_index;
   djpeg_dest_ptr dest_mgr = NULL;
-  FILE * input_file;
-  FILE * output_file;
+  FILE *input_file;
+  FILE *output_file;
   unsigned char *inbuffer = NULL;
   unsigned long insize = 0;
   JDIMENSION num_scanlines;
@@ -634,14 +663,88 @@
   /* Start decompressor */
   (void) jpeg_start_decompress(&cinfo);
 
-  /* Write output file header */
-  (*dest_mgr->start_output) (&cinfo, dest_mgr);
+  /* Skip rows */
+  if (skip) {
+    JDIMENSION tmp;
 
-  /* Process data */
-  while (cinfo.output_scanline < cinfo.output_height) {
-    num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-                                        dest_mgr->buffer_height);
-    (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    /* Check for valid skip_end.  We cannot check this value until after
+     * jpeg_start_decompress() is called.  Note that we have already verified
+     * that skip_start <= skip_end.  (output_height is unsigned, hence %u.)
+     */
+    if (skip_end > cinfo.output_height - 1) {
+      fprintf(stderr, "%s: skip region exceeds image height %u\n", progname,
+              cinfo.output_height);
+      exit(EXIT_FAILURE);
+    }
+
+    /* Write output file header.  This is a hack to ensure that the destination
+     * manager creates an output image of the proper size.
+     */
+    tmp = cinfo.output_height;
+    cinfo.output_height -= (skip_end - skip_start + 1);
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+    cinfo.output_height = tmp;
+
+    /* Process data */
+    while (cinfo.output_scanline < skip_start) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
+    jpeg_skip_scanlines(&cinfo, skip_end - skip_start + 1);
+    while (cinfo.output_scanline < cinfo.output_height) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
+
+  /* Decompress a subregion */
+  } else if (crop) {
+    JDIMENSION tmp;
+
+    /* Check the crop region, which cannot be done until after
+     * jpeg_start_decompress().  Subtract, so large values cannot wrap. */
+    if (crop_width > cinfo.output_width || crop_height > cinfo.output_height ||
+        crop_x > cinfo.output_width - crop_width ||
+        crop_y > cinfo.output_height - crop_height) {
+      fprintf(stderr, "%s: crop dimensions exceed image dimensions %u x %u\n",
+              progname, cinfo.output_width, cinfo.output_height);
+      exit(EXIT_FAILURE);
+    }
+
+    jpeg_crop_scanline(&cinfo, &crop_x, &crop_width);
+    ((ppm_dest_ptr) dest_mgr)->buffer_width = cinfo.output_width *
+                                              cinfo.out_color_components *
+                                              sizeof(JSAMPLE);
+
+    /* Write output file header.  This is a hack to ensure that the destination
+     * manager creates an output image of the proper size.
+     */
+    tmp = cinfo.output_height;
+    cinfo.output_height = crop_height;
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+    cinfo.output_height = tmp;
+
+    /* Process data */
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline < crop_y + crop_height) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
+    jpeg_skip_scanlines(&cinfo, cinfo.output_height - crop_y - crop_height);
+
+  /* Normal full-image decompress */
+  } else {
+    /* Write output file header */
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+
+    /* Process data */
+    while (cinfo.output_scanline < cinfo.output_height) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
   }
 
 #ifdef PROGRESS_REPORT
diff --git a/doc/html/annotated.html b/doc/html/annotated.html
index dad0fb1..d0b0e1e 100644
--- a/doc/html/annotated.html
+++ b/doc/html/annotated.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/classes.html b/doc/html/classes.html
index 7720c67..275e96d 100644
--- a/doc/html/classes.html
+++ b/doc/html/classes.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions.html b/doc/html/functions.html
index b13faed..31d78f5 100644
--- a/doc/html/functions.html
+++ b/doc/html/functions.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions_vars.html b/doc/html/functions_vars.html
index cca5603..8373eac 100644
--- a/doc/html/functions_vars.html
+++ b/doc/html/functions_vars.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/group___turbo_j_p_e_g.html b/doc/html/group___turbo_j_p_e_g.html
index 233fe6c..4b8d306 100644
--- a/doc/html/group___turbo_j_p_e_g.html
+++ b/doc/html/group___turbo_j_p_e_g.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
@@ -132,10 +132,10 @@
 <tr class="memdesc:ga0f6dbd18adf38b7d46ac547f0f4d562c"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of transform operations.  <a href="#ga0f6dbd18adf38b7d46ac547f0f4d562c">More...</a><br/></td></tr>
 <tr class="separator:ga0f6dbd18adf38b7d46ac547f0f4d562c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga50e03cb5ed115330e212417429600b00"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga50e03cb5ed115330e212417429600b00">TJXOPT_PERFECT</a></td></tr>
-<tr class="memdesc:ga50e03cb5ed115330e212417429600b00"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to return an error if the transform is not perfect.  <a href="#ga50e03cb5ed115330e212417429600b00">More...</a><br/></td></tr>
+<tr class="memdesc:ga50e03cb5ed115330e212417429600b00"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to return an error if the transform is not perfect.  <a href="#ga50e03cb5ed115330e212417429600b00">More...</a><br/></td></tr>
 <tr class="separator:ga50e03cb5ed115330e212417429600b00"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga319826b7eb1583c0595bbe7b95428709"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga319826b7eb1583c0595bbe7b95428709">TJXOPT_TRIM</a></td></tr>
-<tr class="memdesc:ga319826b7eb1583c0595bbe7b95428709"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to discard any partial MCU blocks that cannot be transformed.  <a href="#ga319826b7eb1583c0595bbe7b95428709">More...</a><br/></td></tr>
+<tr class="memdesc:ga319826b7eb1583c0595bbe7b95428709"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to discard any partial MCU blocks that cannot be transformed.  <a href="#ga319826b7eb1583c0595bbe7b95428709">More...</a><br/></td></tr>
 <tr class="separator:ga319826b7eb1583c0595bbe7b95428709"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga9c771a757fc1294add611906b89ab2d2"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga9c771a757fc1294add611906b89ab2d2">TJXOPT_CROP</a></td></tr>
 <tr class="memdesc:ga9c771a757fc1294add611906b89ab2d2"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will enable lossless cropping.  <a href="#ga9c771a757fc1294add611906b89ab2d2">More...</a><br/></td></tr>
@@ -144,7 +144,7 @@
 <tr class="memdesc:ga3acee7b48ade1b99e5588736007c2589"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will discard the color data in the input image and produce a grayscale output image.  <a href="#ga3acee7b48ade1b99e5588736007c2589">More...</a><br/></td></tr>
 <tr class="separator:ga3acee7b48ade1b99e5588736007c2589"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gafbf992bbf6e006705886333703ffab31"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gafbf992bbf6e006705886333703ffab31">TJXOPT_NOOUTPUT</a></td></tr>
-<tr class="memdesc:gafbf992bbf6e006705886333703ffab31"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from outputting a JPEG image for this particular transform (this can be used in conjunction with a custom filter to capture the transformed DCT coefficients without transcoding them.)  <a href="#gafbf992bbf6e006705886333703ffab31">More...</a><br/></td></tr>
+<tr class="memdesc:gafbf992bbf6e006705886333703ffab31"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from outputting a JPEG image for this particular transform (this can be used in conjunction with a custom filter to capture the transformed DCT coefficients without transcoding them.)  <a href="#gafbf992bbf6e006705886333703ffab31">More...</a><br/></td></tr>
 <tr class="separator:gafbf992bbf6e006705886333703ffab31"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga0aba955473315e405295d978f0c16511"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511">TJPAD</a>(width)</td></tr>
 <tr class="memdesc:ga0aba955473315e405295d978f0c16511"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pad the given width to the nearest 32-bit boundary.  <a href="#ga0aba955473315e405295d978f0c16511">More...</a><br/></td></tr>
@@ -218,7 +218,7 @@
 <a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866a3064ee5dfb7f032df332818587567a08">TJXOP_ROT270</a>
 <br/>
  }</td></tr>
-<tr class="memdesc:ga2de531af4e7e6c4f124908376b354866"><td class="mdescLeft">&#160;</td><td class="mdescRight">Transform operations for <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>  <a href="group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866">More...</a><br/></td></tr>
+<tr class="memdesc:ga2de531af4e7e6c4f124908376b354866"><td class="mdescLeft">&#160;</td><td class="mdescRight">Transform operations for <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>  <a href="group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866">More...</a><br/></td></tr>
 <tr class="separator:ga2de531af4e7e6c4f124908376b354866"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
@@ -226,15 +226,15 @@
 <tr class="memitem:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga3d10c47fbe4a2489a2b30c931551d01a">tjInitCompress</a> (void)</td></tr>
 <tr class="memdesc:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG compressor instance.  <a href="#ga3d10c47fbe4a2489a2b30c931551d01a">More...</a><br/></td></tr>
 <tr class="separator:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gaba62b7a98f960839b588579898495cf2"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2">tjCompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)</td></tr>
-<tr class="memdesc:gaba62b7a98f960839b588579898495cf2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB, grayscale, or CMYK image into a JPEG image.  <a href="#gaba62b7a98f960839b588579898495cf2">More...</a><br/></td></tr>
-<tr class="separator:gaba62b7a98f960839b588579898495cf2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67">tjCompressFromYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pad, int height, int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)</td></tr>
-<tr class="memdesc:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress a YUV planar image into a JPEG image.  <a href="#ga0b931126c7a615ddc3bbd0cca6698d67">More...</a><br/></td></tr>
-<tr class="separator:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gaa89a1982cb4556b12ae7af4439991af6"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaa89a1982cb4556b12ae7af4439991af6">tjCompressFromYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char **srcPlanes, int width, int *strides, int height, int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)</td></tr>
-<tr class="memdesc:gaa89a1982cb4556b12ae7af4439991af6"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress a set of Y, U (Cb), and V (Cr) image planes into a JPEG image.  <a href="#gaa89a1982cb4556b12ae7af4439991af6">More...</a><br/></td></tr>
-<tr class="separator:gaa89a1982cb4556b12ae7af4439991af6"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gaf38f2ed44bdc88e730e08b632fa6e88e"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e">tjCompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)</td></tr>
+<tr class="memdesc:gaf38f2ed44bdc88e730e08b632fa6e88e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB, grayscale, or CMYK image into a JPEG image.  <a href="#gaf38f2ed44bdc88e730e08b632fa6e88e">More...</a><br/></td></tr>
+<tr class="separator:gaf38f2ed44bdc88e730e08b632fa6e88e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga6f6de375d6ec0020faba627e37e5a060"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga6f6de375d6ec0020faba627e37e5a060">tjCompressFromYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *srcBuf, int width, int pad, int height, int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)</td></tr>
+<tr class="memdesc:ga6f6de375d6ec0020faba627e37e5a060"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress a YUV planar image into a JPEG image.  <a href="#ga6f6de375d6ec0020faba627e37e5a060">More...</a><br/></td></tr>
+<tr class="separator:ga6f6de375d6ec0020faba627e37e5a060"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga0b84c682d8accf097d7a743c965d3464"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0b84c682d8accf097d7a743c965d3464">tjCompressFromYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char **srcPlanes, int width, const int *strides, int height, int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)</td></tr>
+<tr class="memdesc:ga0b84c682d8accf097d7a743c965d3464"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress a set of Y, U (Cb), and V (Cr) image planes into a JPEG image.  <a href="#ga0b84c682d8accf097d7a743c965d3464">More...</a><br/></td></tr>
+<tr class="separator:ga0b84c682d8accf097d7a743c965d3464"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b">tjBufSize</a> (int width, int height, int jpegSubsamp)</td></tr>
 <tr class="memdesc:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="mdescLeft">&#160;</td><td class="mdescRight">The maximum size of the buffer (in bytes) required to hold a JPEG image with the given parameters.  <a href="#gaccc5bca7f12fcdcc302e6e1c6d4b311b">More...</a><br/></td></tr>
 <tr class="separator:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -250,42 +250,42 @@
 <tr class="memitem:ga1a209696c6a80748f20e134b3c64789f"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga1a209696c6a80748f20e134b3c64789f">tjPlaneHeight</a> (int componentID, int height, int subsamp)</td></tr>
 <tr class="memdesc:ga1a209696c6a80748f20e134b3c64789f"><td class="mdescLeft">&#160;</td><td class="mdescRight">The plane height of a YUV image plane with the given parameters.  <a href="#ga1a209696c6a80748f20e134b3c64789f">More...</a><br/></td></tr>
 <tr class="separator:ga1a209696c6a80748f20e134b3c64789f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360">tjEncodeYUV3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)</td></tr>
-<tr class="memdesc:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#ga0a5ffbf7cb58a5b6a8201114fe889360">More...</a><br/></td></tr>
-<tr class="separator:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gaa791db8598853ddcad24e42897ef1269"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaa791db8598853ddcad24e42897ef1269">tjEncodeYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp, int flags)</td></tr>
-<tr class="memdesc:gaa791db8598853ddcad24e42897ef1269"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into separate Y, U (Cb), and V (Cr) image planes.  <a href="#gaa791db8598853ddcad24e42897ef1269">More...</a><br/></td></tr>
-<tr class="separator:gaa791db8598853ddcad24e42897ef1269"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gaabe05acd734990053ad1294b5ef239aa"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaabe05acd734990053ad1294b5ef239aa">tjEncodeYUV3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)</td></tr>
+<tr class="memdesc:gaabe05acd734990053ad1294b5ef239aa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#gaabe05acd734990053ad1294b5ef239aa">More...</a><br/></td></tr>
+<tr class="separator:gaabe05acd734990053ad1294b5ef239aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga8a65ed3bd12df57c219d46afbc9008f1"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga8a65ed3bd12df57c219d46afbc9008f1">tjEncodeYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp, int flags)</td></tr>
+<tr class="memdesc:ga8a65ed3bd12df57c219d46afbc9008f1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into separate Y, U (Cb), and V (Cr) image planes.  <a href="#ga8a65ed3bd12df57c219d46afbc9008f1">More...</a><br/></td></tr>
+<tr class="separator:ga8a65ed3bd12df57c219d46afbc9008f1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gae5408179d041e2a2f7199c8283cf649e"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gae5408179d041e2a2f7199c8283cf649e">tjInitDecompress</a> (void)</td></tr>
 <tr class="memdesc:gae5408179d041e2a2f7199c8283cf649e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG decompressor instance.  <a href="#gae5408179d041e2a2f7199c8283cf649e">More...</a><br/></td></tr>
 <tr class="separator:gae5408179d041e2a2f7199c8283cf649e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gacd0fac3af74b3511d39b4781b7103086"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086">tjDecompressHeader3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp, int *jpegColorspace)</td></tr>
-<tr class="memdesc:gacd0fac3af74b3511d39b4781b7103086"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#gacd0fac3af74b3511d39b4781b7103086">More...</a><br/></td></tr>
-<tr class="separator:gacd0fac3af74b3511d39b4781b7103086"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga3fced455e504e8ff4fbad28ba94a3020"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga3fced455e504e8ff4fbad28ba94a3020">tjDecompressHeader3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp, int *jpegColorspace)</td></tr>
+<tr class="memdesc:ga3fced455e504e8ff4fbad28ba94a3020"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#ga3fced455e504e8ff4fbad28ba94a3020">More...</a><br/></td></tr>
+<tr class="separator:ga3fced455e504e8ff4fbad28ba94a3020"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga6449044b9af402999ccf52f401333be8"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="structtjscalingfactor.html">tjscalingfactor</a> *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8">tjGetScalingFactors</a> (int *numscalingfactors)</td></tr>
 <tr class="memdesc:ga6449044b9af402999ccf52f401333be8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of TurboJPEG supports.  <a href="#ga6449044b9af402999ccf52f401333be8">More...</a><br/></td></tr>
 <tr class="separator:ga6449044b9af402999ccf52f401333be8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gada69cc6443d1bb493b40f1626259e5e9"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9">tjDecompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
-<tr class="memdesc:gada69cc6443d1bb493b40f1626259e5e9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB, grayscale, or CMYK image.  <a href="#gada69cc6443d1bb493b40f1626259e5e9">More...</a><br/></td></tr>
-<tr class="separator:gada69cc6443d1bb493b40f1626259e5e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07">tjDecompressToYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pad, int height, int flags)</td></tr>
-<tr class="memdesc:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#ga7c08b340ad7f8e85d407bd9e81d44d07">More...</a><br/></td></tr>
-<tr class="separator:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0828a38ae29631ac28b6857cefb0eebf"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0828a38ae29631ac28b6857cefb0eebf">tjDecompressToYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char **dstPlanes, int width, int *strides, int height, int flags)</td></tr>
-<tr class="memdesc:ga0828a38ae29631ac28b6857cefb0eebf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image into separate Y, U (Cb), and V (Cr) image planes.  <a href="#ga0828a38ae29631ac28b6857cefb0eebf">More...</a><br/></td></tr>
-<tr class="separator:ga0828a38ae29631ac28b6857cefb0eebf"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed">tjDecodeYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int pad, int subsamp, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
-<tr class="memdesc:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decode a YUV planar image into an RGB or grayscale image.  <a href="#ga132ae2c2cadcf64c8bb0f3bdf69da3ed">More...</a><br/></td></tr>
-<tr class="separator:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga6cb5b0e1101a2b20edea576e11faf93d"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga6cb5b0e1101a2b20edea576e11faf93d">tjDecodeYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char **srcPlanes, int *strides, int subsamp, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
-<tr class="memdesc:ga6cb5b0e1101a2b20edea576e11faf93d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decode a set of Y, U (Cb), and V (Cr) image planes into an RGB or grayscale image.  <a href="#ga6cb5b0e1101a2b20edea576e11faf93d">More...</a><br/></td></tr>
-<tr class="separator:ga6cb5b0e1101a2b20edea576e11faf93d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gad8026a417e16a76313bc0a6c9e8b2ba2"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad8026a417e16a76313bc0a6c9e8b2ba2">tjDecompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
+<tr class="memdesc:gad8026a417e16a76313bc0a6c9e8b2ba2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB, grayscale, or CMYK image.  <a href="#gad8026a417e16a76313bc0a6c9e8b2ba2">More...</a><br/></td></tr>
+<tr class="separator:gad8026a417e16a76313bc0a6c9e8b2ba2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga39e08906528db5a764670ea48d344b09"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga39e08906528db5a764670ea48d344b09">tjDecompressToYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pad, int height, int flags)</td></tr>
+<tr class="memdesc:ga39e08906528db5a764670ea48d344b09"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#ga39e08906528db5a764670ea48d344b09">More...</a><br/></td></tr>
+<tr class="separator:ga39e08906528db5a764670ea48d344b09"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga38d0ef90692663b3ffb5b16da2541512"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga38d0ef90692663b3ffb5b16da2541512">tjDecompressToYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char **dstPlanes, int width, int *strides, int height, int flags)</td></tr>
+<tr class="memdesc:ga38d0ef90692663b3ffb5b16da2541512"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image into separate Y, U (Cb), and V (Cr) image planes.  <a href="#ga38d0ef90692663b3ffb5b16da2541512">More...</a><br/></td></tr>
+<tr class="separator:ga38d0ef90692663b3ffb5b16da2541512"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga077c61027b875afecd5a1613bf18b3c1"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga077c61027b875afecd5a1613bf18b3c1">tjDecodeYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *srcBuf, int pad, int subsamp, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
+<tr class="memdesc:ga077c61027b875afecd5a1613bf18b3c1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decode a YUV planar image into an RGB or grayscale image.  <a href="#ga077c61027b875afecd5a1613bf18b3c1">More...</a><br/></td></tr>
+<tr class="separator:ga077c61027b875afecd5a1613bf18b3c1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gaf42f19b7a496eb18bdc84fe61ee6d3e2"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaf42f19b7a496eb18bdc84fe61ee6d3e2">tjDecodeYUVPlanes</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char **srcPlanes, const int *strides, int subsamp, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
+<tr class="memdesc:gaf42f19b7a496eb18bdc84fe61ee6d3e2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decode a set of Y, U (Cb), and V (Cr) image planes into an RGB or grayscale image.  <a href="#gaf42f19b7a496eb18bdc84fe61ee6d3e2">More...</a><br/></td></tr>
+<tr class="separator:gaf42f19b7a496eb18bdc84fe61ee6d3e2"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga3155b775bfbac9dbba869b95a0367902"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga3155b775bfbac9dbba869b95a0367902">tjInitTransform</a> (void)</td></tr>
 <tr class="memdesc:ga3155b775bfbac9dbba869b95a0367902"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a new TurboJPEG transformer instance.  <a href="#ga3155b775bfbac9dbba869b95a0367902">More...</a><br/></td></tr>
 <tr class="separator:ga3155b775bfbac9dbba869b95a0367902"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gae403193ceb4aafb7e0f56ab587b48616"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616">tjTransform</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, <a class="el" href="structtjtransform.html">tjtransform</a> *transforms, int flags)</td></tr>
-<tr class="memdesc:gae403193ceb4aafb7e0f56ab587b48616"><td class="mdescLeft">&#160;</td><td class="mdescRight">Losslessly transform a JPEG image into another JPEG image.  <a href="#gae403193ceb4aafb7e0f56ab587b48616">More...</a><br/></td></tr>
-<tr class="separator:gae403193ceb4aafb7e0f56ab587b48616"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gad02cd42b69f193a0623a9c801788df3a"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a">tjTransform</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, const unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, <a class="el" href="structtjtransform.html">tjtransform</a> *transforms, int flags)</td></tr>
+<tr class="memdesc:gad02cd42b69f193a0623a9c801788df3a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Losslessly transform a JPEG image into another JPEG image.  <a href="#gad02cd42b69f193a0623a9c801788df3a">More...</a><br/></td></tr>
+<tr class="separator:gad02cd42b69f193a0623a9c801788df3a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga674adee917b95ad4a896f1ba39e12540"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540">tjDestroy</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle)</td></tr>
 <tr class="memdesc:ga674adee917b95ad4a896f1ba39e12540"><td class="mdescLeft">&#160;</td><td class="mdescRight">Destroy a TurboJPEG compressor, decompressor, or transformer instance.  <a href="#ga674adee917b95ad4a896f1ba39e12540">More...</a><br/></td></tr>
 <tr class="separator:ga674adee917b95ad4a896f1ba39e12540"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -455,7 +455,7 @@
 </div><div class="memdoc">
 
 <p>Disable buffer (re)allocation. </p>
-<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
+<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
 
 </div>
 </div>
@@ -517,7 +517,7 @@
 </div><div class="memdoc">
 
 <p>This option will enable lossless cropping. </p>
-<p>See <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> for more information. </p>
+<p>See <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> for more information. </p>
 
 </div>
 </div>
@@ -545,7 +545,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from outputting a JPEG image for this particular transform (this can be used in conjunction with a custom filter to capture the transformed DCT coefficients without transcoding them.) </p>
+<p>This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from outputting a JPEG image for this particular transform (this can be used in conjunction with a custom filter to capture the transformed DCT coefficients without transcoding them.) </p>
 
 </div>
 </div>
@@ -559,7 +559,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to return an error if the transform is not perfect. </p>
+<p>This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to return an error if the transform is not perfect. </p>
 <p>Lossless transforms operate on MCU blocks, whose size depends on the level of chrominance subsampling used (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a> and <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>.) If the image's width or height is not evenly divisible by the MCU block size, then there will be partial MCU blocks on the right and/or bottom edges. It is not possible to move these partial MCU blocks to the top or left of the image, so any transform that would require that is "imperfect." If this option is not specified, then any partial MCU blocks that cannot be transformed will be left in place, which will create odd-looking strips on the right or bottom edge of the image. </p>
 
 </div>
@@ -574,7 +574,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to discard any partial MCU blocks that cannot be transformed. </p>
+<p>This option will cause <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> to discard any partial MCU blocks that cannot be transformed. </p>
 
 </div>
 </div>
@@ -761,7 +761,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>Transform operations for <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> </p>
+<p>Transform operations for <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> </p>
 <table class="fieldtable">
 <tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27"></a>TJXOP_NONE</em>&nbsp;</td><td class="fielddoc">
 <p>Do not transform the position of the image pixels. </p>
@@ -812,7 +812,7 @@
 </div><div class="memdoc">
 
 <p>Allocate an image buffer for use with TurboJPEG. </p>
-<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
+<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">bytes</td><td>the number of bytes to allocate</td></tr>
@@ -918,7 +918,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gaba62b7a98f960839b588579898495cf2"></a>
+<a class="anchor" id="gaf38f2ed44bdc88e730e08b632fa6e88e"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -931,7 +931,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>srcBuf</em>, </td>
         </tr>
         <tr>
@@ -1000,7 +1000,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB, grayscale, or CMYK pixels to be compressed. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB, grayscale, or CMYK pixels to be compressed</td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image</td></tr>
     <tr><td class="paramname">pitch</td><td>bytes per line in the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>.</td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image</td></tr>
@@ -1022,7 +1022,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga0b931126c7a615ddc3bbd0cca6698d67"></a>
+<a class="anchor" id="ga6f6de375d6ec0020faba627e37e5a060"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1035,7 +1035,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>srcBuf</em>, </td>
         </tr>
         <tr>
@@ -1098,7 +1098,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be compressed. The size of this buffer should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> for the given image width, height, padding, and level of chrominance subsampling. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the source buffer (refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.) This buffer is not modified.</td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be compressed. The size of this buffer should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> for the given image width, height, padding, and level of chrominance subsampling. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the source buffer (refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.)</td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image. If the width is not an even multiple of the MCU block width (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
     <tr><td class="paramname">pad</td><td>the line padding used in the source image. For instance, if each line in each plane of the YUV image is padded to the nearest multiple of 4 bytes, then <code>pad</code> should be set to 4.</td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image. If the height is not an even multiple of the MCU block height (see <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
@@ -1119,7 +1119,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gaa89a1982cb4556b12ae7af4439991af6"></a>
+<a class="anchor" id="ga0b84c682d8accf097d7a743c965d3464"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1132,7 +1132,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char **&#160;</td>
+          <td class="paramtype">const unsigned char **&#160;</td>
           <td class="paramname"><em>srcPlanes</em>, </td>
         </tr>
         <tr>
@@ -1144,7 +1144,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">int *&#160;</td>
+          <td class="paramtype">const int *&#160;</td>
           <td class="paramname"><em>strides</em>, </td>
         </tr>
         <tr>
@@ -1195,7 +1195,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcPlanes</td><td>an array of pointers to Y, U (Cb), and V (Cr) image planes (or just a Y plane, if compressing a grayscale image) that contain a YUV image to be compressed. These planes can be contiguous or non-contiguous in memory. The size of each plane should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2" title="The size of the buffer (in bytes) required to hold a YUV image plane with the given parameters...">tjPlaneSizeYUV()</a> for the given image width, height, strides, and level of chrominance subsampling. Refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a> for more details. These image planes are not modified.</td></tr>
+    <tr><td class="paramname">srcPlanes</td><td>an array of pointers to Y, U (Cb), and V (Cr) image planes (or just a Y plane, if compressing a grayscale image) that contain a YUV image to be compressed. These planes can be contiguous or non-contiguous in memory. The size of each plane should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2" title="The size of the buffer (in bytes) required to hold a YUV image plane with the given parameters...">tjPlaneSizeYUV()</a> for the given image width, height, strides, and level of chrominance subsampling. Refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a> for more details.</td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image. If the width is not an even multiple of the MCU block width (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
     <tr><td class="paramname">strides</td><td>an array of integers, each specifying the number of bytes per line in the corresponding plane of the YUV source image. Setting the stride for any plane to 0 is the same as setting it to the plane width (see <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.) If <code>strides</code> is NULL, then the strides for all planes will be set to their respective plane widths. You can adjust the strides in order to specify an arbitrary amount of line padding in each plane or to create a JPEG image from a subregion of a larger YUV planar image.</td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image. If the height is not an even multiple of the MCU block height (see <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
@@ -1216,7 +1216,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga132ae2c2cadcf64c8bb0f3bdf69da3ed"></a>
+<a class="anchor" id="ga077c61027b875afecd5a1613bf18b3c1"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1229,7 +1229,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>srcBuf</em>, </td>
         </tr>
         <tr>
@@ -1293,7 +1293,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be decoded. The size of this buffer should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> for the given image width, height, padding, and level of chrominance subsampling. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the source buffer (refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.) This buffer is not modified.</td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be decoded. The size of this buffer should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> for the given image width, height, padding, and level of chrominance subsampling. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the source buffer (refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.)</td></tr>
     <tr><td class="paramname">pad</td><td>Use this parameter to specify that the width of each line in each plane of the YUV source image is padded to the nearest multiple of this number of bytes (must be a power of 2.)</td></tr>
     <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling used in the YUV source image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decoded image. This buffer should normally be <code>pitch * height</code> bytes in size, but the <code>dstBuf</code> pointer can also be used to decode into a specific region of a larger buffer.</td></tr>
@@ -1309,7 +1309,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga6cb5b0e1101a2b20edea576e11faf93d"></a>
+<a class="anchor" id="gaf42f19b7a496eb18bdc84fe61ee6d3e2"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1322,13 +1322,13 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char **&#160;</td>
+          <td class="paramtype">const unsigned char **&#160;</td>
           <td class="paramname"><em>srcPlanes</em>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">int *&#160;</td>
+          <td class="paramtype">const int *&#160;</td>
           <td class="paramname"><em>strides</em>, </td>
         </tr>
         <tr>
@@ -1386,7 +1386,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcPlanes</td><td>an array of pointers to Y, U (Cb), and V (Cr) image planes (or just a Y plane, if decoding a grayscale image) that contain a YUV image to be decoded. These planes can be contiguous or non-contiguous in memory. The size of each plane should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2" title="The size of the buffer (in bytes) required to hold a YUV image plane with the given parameters...">tjPlaneSizeYUV()</a> for the given image width, height, strides, and level of chrominance subsampling. Refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a> for more details. These image planes are not modified.</td></tr>
+    <tr><td class="paramname">srcPlanes</td><td>an array of pointers to Y, U (Cb), and V (Cr) image planes (or just a Y plane, if decoding a grayscale image) that contain a YUV image to be decoded. These planes can be contiguous or non-contiguous in memory. The size of each plane should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2" title="The size of the buffer (in bytes) required to hold a YUV image plane with the given parameters...">tjPlaneSizeYUV()</a> for the given image width, height, strides, and level of chrominance subsampling. Refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a> for more details.</td></tr>
     <tr><td class="paramname">strides</td><td>an array of integers, each specifying the number of bytes per line in the corresponding plane of the YUV source image. Setting the stride for any plane to 0 is the same as setting it to the plane width (see <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.) If <code>strides</code> is NULL, then the strides for all planes will be set to their respective plane widths. You can adjust the strides in order to specify an arbitrary amount of line padding in each plane or to decode a subregion of a larger YUV planar image.</td></tr>
     <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling used in the YUV source image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decoded image. This buffer should normally be <code>pitch * height</code> bytes in size, but the <code>dstBuf</code> pointer can also be used to decode into a specific region of a larger buffer.</td></tr>
@@ -1402,7 +1402,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gada69cc6443d1bb493b40f1626259e5e9"></a>
+<a class="anchor" id="gad8026a417e16a76313bc0a6c9e8b2ba2"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1415,7 +1415,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>jpegBuf</em>, </td>
         </tr>
         <tr>
@@ -1472,7 +1472,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress</td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes)</td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decompressed image. This buffer should normally be <code>pitch * scaledHeight</code> bytes in size, where <code>scaledHeight</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image height and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>. The <code>dstBuf</code> pointer may also be used to decompress into a specific region of a larger buffer.</td></tr>
     <tr><td class="paramname">width</td><td>desired width (in pixels) of the destination image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size.</td></tr>
@@ -1487,7 +1487,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gacd0fac3af74b3511d39b4781b7103086"></a>
+<a class="anchor" id="ga3fced455e504e8ff4fbad28ba94a3020"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1500,7 +1500,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>jpegBuf</em>, </td>
         </tr>
         <tr>
@@ -1545,7 +1545,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing a JPEG image. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing a JPEG image</td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes)</td></tr>
     <tr><td class="paramname">width</td><td>pointer to an integer variable that will receive the width (in pixels) of the JPEG image</td></tr>
     <tr><td class="paramname">height</td><td>pointer to an integer variable that will receive the height (in pixels) of the JPEG image</td></tr>
@@ -1558,7 +1558,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga7c08b340ad7f8e85d407bd9e81d44d07"></a>
+<a class="anchor" id="ga39e08906528db5a764670ea48d344b09"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1571,7 +1571,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>jpegBuf</em>, </td>
         </tr>
         <tr>
@@ -1623,7 +1623,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress</td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes)</td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> to determine the appropriate size for this buffer based on the image width, height, padding, and level of subsampling. The Y, U (Cb), and V (Cr) image planes will be stored sequentially in the buffer (refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a>.)</td></tr>
     <tr><td class="paramname">width</td><td>desired width (in pixels) of the YUV image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. If the scaled width is not an even multiple of the MCU block width (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
@@ -1637,7 +1637,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga0828a38ae29631ac28b6857cefb0eebf"></a>
+<a class="anchor" id="ga38d0ef90692663b3ffb5b16da2541512"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1650,7 +1650,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>jpegBuf</em>, </td>
         </tr>
         <tr>
@@ -1702,7 +1702,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance</td></tr>
-    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress</td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes)</td></tr>
     <tr><td class="paramname">dstPlanes</td><td>an array of pointers to Y, U (Cb), and V (Cr) image planes (or just a Y plane, if decompressing a grayscale image) that will receive the YUV image. These planes can be contiguous or non-contiguous in memory. Use <a class="el" href="group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2" title="The size of the buffer (in bytes) required to hold a YUV image plane with the given parameters...">tjPlaneSizeYUV()</a> to determine the appropriate size for each plane based on the scaled image width, scaled image height, strides, and level of chrominance subsampling. Refer to <a class="el" href="group___turbo_j_p_e_g.html#YUVnotes">YUV Image Format Notes</a> for more details.</td></tr>
     <tr><td class="paramname">width</td><td>desired width (in pixels) of the YUV image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. If the scaled width is not an even multiple of the MCU block width (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a>), then an intermediate buffer copy will be performed within TurboJPEG.</td></tr>
@@ -1741,7 +1741,7 @@
 
 </div>
 </div>
-<a class="anchor" id="ga0a5ffbf7cb58a5b6a8201114fe889360"></a>
+<a class="anchor" id="gaabe05acd734990053ad1294b5ef239aa"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1754,7 +1754,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>srcBuf</em>, </td>
         </tr>
         <tr>
@@ -1818,7 +1818,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be encoded. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be encoded</td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image</td></tr>
     <tr><td class="paramname">pitch</td><td>bytes per line in the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>.</td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image</td></tr>
@@ -1834,7 +1834,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gaa791db8598853ddcad24e42897ef1269"></a>
+<a class="anchor" id="ga8a65ed3bd12df57c219d46afbc9008f1"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -1847,7 +1847,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>srcBuf</em>, </td>
         </tr>
         <tr>
@@ -1911,7 +1911,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance</td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be encoded. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be encoded</td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image</td></tr>
     <tr><td class="paramname">pitch</td><td>bytes per line in the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>.</td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image</td></tr>
@@ -1942,7 +1942,7 @@
 </div><div class="memdoc">
 
 <p>Free an image buffer previously allocated by TurboJPEG. </p>
-<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
+<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">buffer</td><td>address of the buffer to free</td></tr>
@@ -2199,7 +2199,7 @@
 
 </div>
 </div>
-<a class="anchor" id="gae403193ceb4aafb7e0f56ab587b48616"></a>
+<a class="anchor" id="gad02cd42b69f193a0623a9c801788df3a"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
@@ -2212,7 +2212,7 @@
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramtype">const unsigned char *&#160;</td>
           <td class="paramname"><em>jpegBuf</em>, </td>
         </tr>
         <tr>
@@ -2264,7 +2264,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG transformer instance</td></tr>
-    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG source image to transform. This buffer is not modified.</td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG source image to transform</td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG source image (in bytes)</td></tr>
     <tr><td class="paramname">n</td><td>the number of transformed JPEG images to generate</td></tr>
     <tr><td class="paramname">dstBufs</td><td>pointer to an array of n image buffers. <code>dstBufs[i]</code> will receive a JPEG image that has been transformed using the parameters in <code>transforms[i]</code>. TurboJPEG has the ability to reallocate the JPEG buffer to accommodate the size of the JPEG image. Thus, you can choose to:<ol type="1">
diff --git a/doc/html/index.html b/doc/html/index.html
index ccaa12e..3cc1b3e 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/modules.html b/doc/html/modules.html
index 9b41adb..8e6f815 100644
--- a/doc/html/modules.html
+++ b/doc/html/modules.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/search/all_74.js b/doc/html/search/all_74.js
index 5b46106..444aaef 100644
--- a/doc/html/search/all_74.js
+++ b/doc/html/search/all_74.js
@@ -8,24 +8,24 @@
   ['tjblueoffset',['tjBlueOffset',['../group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
   ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
-  ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
-  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67',1,'turbojpeg.h']]],
-  ['tjcompressfromyuvplanes',['tjCompressFromYUVPlanes',['../group___turbo_j_p_e_g.html#gaa89a1982cb4556b12ae7af4439991af6',1,'turbojpeg.h']]],
+  ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e',1,'turbojpeg.h']]],
+  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga6f6de375d6ec0020faba627e37e5a060',1,'turbojpeg.h']]],
+  ['tjcompressfromyuvplanes',['tjCompressFromYUVPlanes',['../group___turbo_j_p_e_g.html#ga0b84c682d8accf097d7a743c965d3464',1,'turbojpeg.h']]],
   ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
   ['tjcs_5fcmyk',['TJCS_CMYK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53',1,'turbojpeg.h']]],
   ['tjcs_5fgray',['TJCS_GRAY',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a',1,'turbojpeg.h']]],
   ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
   ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
   ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
-  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed',1,'turbojpeg.h']]],
-  ['tjdecodeyuvplanes',['tjDecodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga6cb5b0e1101a2b20edea576e11faf93d',1,'turbojpeg.h']]],
-  ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuvplanes',['tjDecompressToYUVPlanes',['../group___turbo_j_p_e_g.html#ga0828a38ae29631ac28b6857cefb0eebf',1,'turbojpeg.h']]],
+  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga077c61027b875afecd5a1613bf18b3c1',1,'turbojpeg.h']]],
+  ['tjdecodeyuvplanes',['tjDecodeYUVPlanes',['../group___turbo_j_p_e_g.html#gaf42f19b7a496eb18bdc84fe61ee6d3e2',1,'turbojpeg.h']]],
+  ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gad8026a417e16a76313bc0a6c9e8b2ba2',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#ga3fced455e504e8ff4fbad28ba94a3020',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga39e08906528db5a764670ea48d344b09',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuvplanes',['tjDecompressToYUVPlanes',['../group___turbo_j_p_e_g.html#ga38d0ef90692663b3ffb5b16da2541512',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
-  ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#gaa791db8598853ddcad24e42897ef1269',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#gaabe05acd734990053ad1294b5ef239aa',1,'turbojpeg.h']]],
+  ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga8a65ed3bd12df57c219d46afbc9008f1',1,'turbojpeg.h']]],
   ['tjflag_5faccuratedct',['TJFLAG_ACCURATEDCT',['../group___turbo_j_p_e_g.html#gacb233cfd722d66d1ccbf48a7de81f0e0',1,'turbojpeg.h']]],
   ['tjflag_5fbottomup',['TJFLAG_BOTTOMUP',['../group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec',1,'turbojpeg.h']]],
   ['tjflag_5ffastdct',['TJFLAG_FASTDCT',['../group___turbo_j_p_e_g.html#gaabce235db80d3f698b27f36cbd453da2',1,'turbojpeg.h']]],
@@ -70,7 +70,7 @@
   ['tjsamp_5fgray',['TJSAMP_GRAY',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a3f1c9504842ddc7a48d0f690754b6248',1,'turbojpeg.h']]],
   ['tjscaled',['TJSCALED',['../group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df',1,'turbojpeg.h']]],
   ['tjscalingfactor',['tjscalingfactor',['../structtjscalingfactor.html',1,'']]],
-  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616',1,'tjTransform(tjhandle handle, unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h']]],
+  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a',1,'tjTransform(tjhandle handle, const unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]],
   ['tjxop_5fhflip',['TJXOP_HFLIP',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce',1,'turbojpeg.h']]],
   ['tjxop_5fnone',['TJXOP_NONE',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27',1,'turbojpeg.h']]],
diff --git a/doc/html/search/functions_74.js b/doc/html/search/functions_74.js
index 73b7ee9..69410b0 100644
--- a/doc/html/search/functions_74.js
+++ b/doc/html/search/functions_74.js
@@ -3,18 +3,18 @@
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
   ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
-  ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
-  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67',1,'turbojpeg.h']]],
-  ['tjcompressfromyuvplanes',['tjCompressFromYUVPlanes',['../group___turbo_j_p_e_g.html#gaa89a1982cb4556b12ae7af4439991af6',1,'turbojpeg.h']]],
-  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed',1,'turbojpeg.h']]],
-  ['tjdecodeyuvplanes',['tjDecodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga6cb5b0e1101a2b20edea576e11faf93d',1,'turbojpeg.h']]],
-  ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuvplanes',['tjDecompressToYUVPlanes',['../group___turbo_j_p_e_g.html#ga0828a38ae29631ac28b6857cefb0eebf',1,'turbojpeg.h']]],
+  ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaf38f2ed44bdc88e730e08b632fa6e88e',1,'turbojpeg.h']]],
+  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga6f6de375d6ec0020faba627e37e5a060',1,'turbojpeg.h']]],
+  ['tjcompressfromyuvplanes',['tjCompressFromYUVPlanes',['../group___turbo_j_p_e_g.html#ga0b84c682d8accf097d7a743c965d3464',1,'turbojpeg.h']]],
+  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga077c61027b875afecd5a1613bf18b3c1',1,'turbojpeg.h']]],
+  ['tjdecodeyuvplanes',['tjDecodeYUVPlanes',['../group___turbo_j_p_e_g.html#gaf42f19b7a496eb18bdc84fe61ee6d3e2',1,'turbojpeg.h']]],
+  ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gad8026a417e16a76313bc0a6c9e8b2ba2',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#ga3fced455e504e8ff4fbad28ba94a3020',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga39e08906528db5a764670ea48d344b09',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuvplanes',['tjDecompressToYUVPlanes',['../group___turbo_j_p_e_g.html#ga38d0ef90692663b3ffb5b16da2541512',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
-  ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#gaa791db8598853ddcad24e42897ef1269',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#gaabe05acd734990053ad1294b5ef239aa',1,'turbojpeg.h']]],
+  ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga8a65ed3bd12df57c219d46afbc9008f1',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
   ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
   ['tjgetscalingfactors',['tjGetScalingFactors',['../group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8',1,'turbojpeg.h']]],
@@ -24,5 +24,5 @@
   ['tjplaneheight',['tjPlaneHeight',['../group___turbo_j_p_e_g.html#ga1a209696c6a80748f20e134b3c64789f',1,'turbojpeg.h']]],
   ['tjplanesizeyuv',['tjPlaneSizeYUV',['../group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2',1,'turbojpeg.h']]],
   ['tjplanewidth',['tjPlaneWidth',['../group___turbo_j_p_e_g.html#ga63fb66bb1e36c74008c4634360becbb1',1,'turbojpeg.h']]],
-  ['tjtransform',['tjTransform',['../group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616',1,'turbojpeg.h']]]
+  ['tjtransform',['tjTransform',['../group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a',1,'turbojpeg.h']]]
 ];
diff --git a/doc/html/structtjregion.html b/doc/html/structtjregion.html
index d22c09e..af2a473 100644
--- a/doc/html/structtjregion.html
+++ b/doc/html/structtjregion.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjscalingfactor.html b/doc/html/structtjscalingfactor.html
index f7b7c84..3bb50f5 100644
--- a/doc/html/structtjscalingfactor.html
+++ b/doc/html/structtjscalingfactor.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjtransform.html b/doc/html/structtjtransform.html
index 7a07c2a..9fd97f7 100644
--- a/doc/html/structtjtransform.html
+++ b/doc/html/structtjtransform.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.4</span>
+   &#160;<span id="projectnumber">1.5</span>
    </div>
   </td>
  </tr>
@@ -133,7 +133,7 @@
     <tr><td class="paramname">arrayRegion</td><td><a class="el" href="structtjregion.html" title="Cropping region.">tjregion</a> structure containing the width and height of the array pointed to by <code>coeffs</code> as well as its offset relative to the component plane. TurboJPEG implementations may choose to split each component plane into multiple DCT coefficient arrays and call the callback function once for each array.</td></tr>
     <tr><td class="paramname">planeRegion</td><td><a class="el" href="structtjregion.html" title="Cropping region.">tjregion</a> structure containing the width and height of the component plane to which <code>coeffs</code> belongs</td></tr>
     <tr><td class="paramname">componentID</td><td>ID number of the component plane to which <code>coeffs</code> belongs (Y, Cb, and Cr have, respectively, ID's of 0, 1, and 2 in typical JPEG images.)</td></tr>
-    <tr><td class="paramname">transformID</td><td>ID number of the transformed image to which <code>coeffs</code> belongs. This is the same as the index of the transform in the <code>transforms</code> array that was passed to <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>.</td></tr>
+    <tr><td class="paramname">transformID</td><td>ID number of the transformed image to which <code>coeffs</code> belongs. This is the same as the index of the transform in the <code>transforms</code> array that was passed to <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>.</td></tr>
     <tr><td class="paramname">transform</td><td>a pointer to a <a class="el" href="structtjtransform.html" title="Lossless transform.">tjtransform</a> structure that specifies the parameters and/or cropping region for this transform</td></tr>
   </table>
   </dd>
diff --git a/doxygen.config b/doxygen.config
index 4ffba68..1723123 100644
--- a/doxygen.config
+++ b/doxygen.config
@@ -1,5 +1,5 @@
 PROJECT_NAME = TurboJPEG
-PROJECT_NUMBER = 1.4
+PROJECT_NUMBER = 1.5
 OUTPUT_DIRECTORY = doc/
 USE_WINDOWS_ENCODING = NO
 OPTIMIZE_OUTPUT_FOR_C = YES
diff --git a/example.c b/example.c
index 0a65a6c..ac27f49 100644
--- a/example.c
+++ b/example.c
@@ -58,7 +58,7 @@
  * RGB color and is described by:
  */
 
-extern JSAMPLE * image_buffer;  /* Points to large array of R,G,B-order data */
+extern JSAMPLE *image_buffer;   /* Points to large array of R,G,B-order data */
 extern int image_height;        /* Number of rows in image */
 extern int image_width;         /* Number of columns in image */
 
@@ -69,7 +69,7 @@
  */
 
 GLOBAL(void)
-write_JPEG_file (char * filename, int quality)
+write_JPEG_file (char *filename, int quality)
 {
   /* This struct contains the JPEG compression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -88,7 +88,7 @@
    */
   struct jpeg_error_mgr jerr;
   /* More stuff */
-  FILE * outfile;               /* target file */
+  FILE *outfile;                /* target file */
   JSAMPROW row_pointer[1];      /* pointer to JSAMPLE row[s] */
   int row_stride;               /* physical row width in image buffer */
 
@@ -253,7 +253,7 @@
   jmp_buf setjmp_buffer;        /* for return to caller */
 };
 
-typedef struct my_error_mgr * my_error_ptr;
+typedef struct my_error_mgr *my_error_ptr;
 
 /*
  * Here's the routine that will replace the standard error_exit method:
@@ -281,7 +281,7 @@
 
 
 GLOBAL(int)
-read_JPEG_file (char * filename)
+read_JPEG_file (char *filename)
 {
   /* This struct contains the JPEG decompression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -293,7 +293,7 @@
    */
   struct my_error_mgr jerr;
   /* More stuff */
-  FILE * infile;                /* source file */
+  FILE *infile;                 /* source file */
   JSAMPARRAY buffer;            /* Output row buffer */
   int row_stride;               /* physical row width in output buffer */
 
diff --git a/jaricom.c b/jaricom.c
index f43e2ea..3bb557f 100644
--- a/jaricom.c
+++ b/jaricom.c
@@ -1,9 +1,12 @@
 /*
  * jaricom.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains probability estimation tables for common use in
  * arithmetic entropy encoding and decoding routines.
@@ -18,7 +21,7 @@
 #include "jpeglib.h"
 
 /* The following #define specifies the packing of the four components
- * into the compact INT32 representation.
+ * into the compact JLONG representation.
  * Note that this formula must match the actual arithmetic encoder
  * and decoder implementation.  The implementation has to be changed
  * if this formula is changed.
@@ -26,9 +29,9 @@
  * implementation (jbig_tab.c).
  */
 
-#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
 
-const INT32 jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113+1] = {
 /*
  * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
  */
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index f9f9e57..0af8ae1 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -5,6 +5,7 @@
   org/libjpegturbo/turbojpeg/TJCompressor
   org/libjpegturbo/turbojpeg/TJCustomFilter
   org/libjpegturbo/turbojpeg/TJDecompressor
+  org/libjpegturbo/turbojpeg/TJException
   org/libjpegturbo/turbojpeg/TJScalingFactor
   org/libjpegturbo/turbojpeg/TJTransform
   org/libjpegturbo/turbojpeg/TJTransformer
diff --git a/java/Makefile.am b/java/Makefile.am
index 23e3412..d3fc59c 100644
--- a/java/Makefile.am
+++ b/java/Makefile.am
@@ -10,6 +10,7 @@
 	org/libjpegturbo/turbojpeg/TJCompressor.java \
 	org/libjpegturbo/turbojpeg/TJCustomFilter.java \
 	org/libjpegturbo/turbojpeg/TJDecompressor.java \
+	org/libjpegturbo/turbojpeg/TJException.java \
 	org/libjpegturbo/turbojpeg/TJScalingFactor.java \
 	org/libjpegturbo/turbojpeg/TJTransform.java \
 	org/libjpegturbo/turbojpeg/TJTransformer.java \
@@ -31,6 +32,7 @@
 	org/libjpegturbo/turbojpeg/TJCompressor.class \
 	org/libjpegturbo/turbojpeg/TJCustomFilter.class \
 	org/libjpegturbo/turbojpeg/TJDecompressor.class \
+	org/libjpegturbo/turbojpeg/TJException.class \
 	org/libjpegturbo/turbojpeg/TJLoader.class \
 	org/libjpegturbo/turbojpeg/TJScalingFactor.class \
 	org/libjpegturbo/turbojpeg/TJTransform.class \
@@ -42,7 +44,7 @@
 
 all: all-am turbojpeg.jar
 
-turbojpeg.jar: $(JAVA_CLASSES) ${srcdir}/MANIFEST.MF
+turbojpeg.jar: classnoinst.stamp ${srcdir}/MANIFEST.MF
 	$(JAR) cfm turbojpeg.jar ${srcdir}/MANIFEST.MF $(JAVA_CLASSES)
 
 clean-local:
diff --git a/java/TJBench.java b/java/TJBench.java
index c9d110c..19db789 100644
--- a/java/TJBench.java
+++ b/java/TJBench.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014, 2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,7 +35,7 @@
 class TJBench {
 
   static int flags = 0, quiet = 0, pf = TJ.PF_BGR, yuvpad = 1, warmup = 1;
-  static boolean compOnly, decompOnly, doTile, doYUV;
+  static boolean compOnly, decompOnly, doTile, doYUV, write = true;
 
   static final String[] pixFormatStr = {
     "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY"
@@ -223,6 +223,8 @@
       }
     }
 
+    if (!write) return;
+
     if (sf.getNum() != 1 || sf.getDenom() != 1)
       sizeStr = new String(sf.getNum() + "_" + sf.getDenom());
     else if (tilew != w || tileh != h)
@@ -394,7 +396,7 @@
         System.out.format("                  Output bit stream:  %f Megabits/sec\n",
           (double)totalJpegSize * 8. / 1000000. * (double)iter / elapsed);
       }
-      if (tilew == w && tileh == h) {
+      if (tilew == w && tileh == h && write) {
         String tempStr = fileName + "_" + subName[subsamp] + "_" + "Q" +
                          jpegQual + ".jpg";
         FileOutputStream fos = new FileOutputStream(tempStr);
@@ -659,7 +661,9 @@
     System.out.println("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)");
     System.out.println("-warmup <w> = Execute each benchmark <w> times to prime the cache before");
     System.out.println("     taking performance measurements (default = 1)");
-    System.out.println("-componly = Stop after running compression tests.  Do not test decompression.\n");
+    System.out.println("-componly = Stop after running compression tests.  Do not test decompression.");
+    System.out.println("-nowrite = Do not write reference or output images (improves consistency");
+    System.out.println("     of performance measurements.)\n");
     System.out.println("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate");
     System.out.println("test will be performed for all quality values in the range.\n");
     System.exit(1);
@@ -817,6 +821,8 @@
           }
           if (argv[i].equalsIgnoreCase("-componly"))
             compOnly = true;
+          if (argv[i].equalsIgnoreCase("-nowrite"))
+            write = false;
           if (argv[i].equalsIgnoreCase("-warmup") && i < argv.length - 1) {
             int temp = -1;
             try {
diff --git a/java/TJExample.java b/java/TJExample.java
index 7562114..da09807 100644
--- a/java/TJExample.java
+++ b/java/TJExample.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2012, 2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2012, 2014-2015 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -351,7 +351,7 @@
   public void customFilter(ShortBuffer coeffBuffer, Rectangle bufferRegion,
                            Rectangle planeRegion, int componentIndex,
                            int transformIndex, TJTransform transform)
-                           throws Exception {
+                           throws TJException {
     for (int i = 0; i < bufferRegion.width * bufferRegion.height; i++) {
       coeffBuffer.put(i, (short)(-coeffBuffer.get(i)));
     }
diff --git a/java/TJUnitTest.java b/java/TJUnitTest.java
index d4726b3..444e798 100644
--- a/java/TJUnitTest.java
+++ b/java/TJUnitTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -892,6 +892,9 @@
             else
               tjc.compress(dstBuf, 0);
           }
+          dstImage = null;
+          dstBuf = null;
+          System.gc();
         }
       }
       System.out.println("Done.      ");
diff --git a/java/doc/allclasses-frame.html b/java/doc/allclasses-frame.html
index 509ea50..fecac06 100644
--- a/java/doc/allclasses-frame.html
+++ b/java/doc/allclasses-frame.html
@@ -13,6 +13,7 @@
 <li><a href="org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJCompressor</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg" target="classFrame"><i>TJCustomFilter</i></a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJDecompressor</a></li>
+<li><a href="org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJException</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJScalingFactor</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJTransform</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJTransformer.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJTransformer</a></li>
diff --git a/java/doc/allclasses-noframe.html b/java/doc/allclasses-noframe.html
index 3eac18f..1f7fd3c 100644
--- a/java/doc/allclasses-noframe.html
+++ b/java/doc/allclasses-noframe.html
@@ -13,6 +13,7 @@
 <li><a href="org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><i>TJCustomFilter</i></a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</a></li>
+<li><a href="org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg">TJScalingFactor</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></li>
 <li><a href="org/libjpegturbo/turbojpeg/TJTransformer.html" title="class in org.libjpegturbo.turbojpeg">TJTransformer</a></li>
diff --git a/java/doc/index-all.html b/java/doc/index-all.html
index 1af78be..a02d9c4 100644
--- a/java/doc/index-all.html
+++ b/java/doc/index-all.html
@@ -821,6 +821,16 @@
  source image stored in <code>yuvImage</code> with the newly created
  instance.</div>
 </dd>
+<dt><a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJException</span></a> - Exception in <a href="./org/libjpegturbo/turbojpeg/package-summary.html">org.libjpegturbo.turbojpeg</a></dt>
+<dd>&nbsp;</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException()">TJException()</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>&nbsp;</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String,%20java.lang.Throwable)">TJException(String, Throwable)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>&nbsp;</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String)">TJException(String)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>&nbsp;</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.Throwable)">TJException(Throwable)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>&nbsp;</dd>
 <dt><a href="./org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJScalingFactor</span></a> - Class in <a href="./org/libjpegturbo/turbojpeg/package-summary.html">org.libjpegturbo.turbojpeg</a></dt>
 <dd>
 <div class="block">Fractional scaling factor</div>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJ.html b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
index f8342f2..ffef657 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJ.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
@@ -983,16 +983,13 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getMCUWidth</h4>
-<pre>public static&nbsp;int&nbsp;getMCUWidth(int&nbsp;subsamp)
-                       throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getMCUWidth(int&nbsp;subsamp)</pre>
 <div class="block">Returns the MCU block width for the given level of chrominance
  subsampling.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>subsamp</code> - the level of chrominance subsampling (one of
  <code>SAMP_*</code>)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the MCU block width for the given level of chrominance
- subsampling.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ subsampling.</dd></dl>
 </li>
 </ul>
 <a name="getMCUHeight(int)">
@@ -1001,16 +998,13 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getMCUHeight</h4>
-<pre>public static&nbsp;int&nbsp;getMCUHeight(int&nbsp;subsamp)
-                        throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getMCUHeight(int&nbsp;subsamp)</pre>
 <div class="block">Returns the MCU block height for the given level of chrominance
  subsampling.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>subsamp</code> - the level of chrominance subsampling (one of
  <code>SAMP_*</code>)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the MCU block height for the given level of chrominance
- subsampling.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ subsampling.</dd></dl>
 </li>
 </ul>
 <a name="getPixelSize(int)">
@@ -1019,13 +1013,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getPixelSize</h4>
-<pre>public static&nbsp;int&nbsp;getPixelSize(int&nbsp;pixelFormat)
-                        throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getPixelSize(int&nbsp;pixelFormat)</pre>
 <div class="block">Returns the pixel size (in bytes) for the given pixel format.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the pixel size (in bytes) for the given pixel format.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the pixel size (in bytes) for the given pixel format.</dd></dl>
 </li>
 </ul>
 <a name="getRedOffset(int)">
@@ -1034,17 +1025,14 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getRedOffset</h4>
-<pre>public static&nbsp;int&nbsp;getRedOffset(int&nbsp;pixelFormat)
-                        throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getRedOffset(int&nbsp;pixelFormat)</pre>
 <div class="block">For the given pixel format, returns the number of bytes that the red
  component is offset from the start of the pixel.  For instance, if a pixel
  of format <code>TJ.PF_BGRX</code> is stored in <code>char pixel[]</code>,
  then the red component will be
  <code>pixel[TJ.getRedOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the red offset for the given pixel format.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the red offset for the given pixel format.</dd></dl>
 </li>
 </ul>
 <a name="getGreenOffset(int)">
@@ -1053,17 +1041,14 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getGreenOffset</h4>
-<pre>public static&nbsp;int&nbsp;getGreenOffset(int&nbsp;pixelFormat)
-                          throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getGreenOffset(int&nbsp;pixelFormat)</pre>
 <div class="block">For the given pixel format, returns the number of bytes that the green
  component is offset from the start of the pixel.  For instance, if a pixel
  of format <code>TJ.PF_BGRX</code> is stored in <code>char pixel[]</code>,
  then the green component will be
  <code>pixel[TJ.getGreenOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the green offset for the given pixel format.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the green offset for the given pixel format.</dd></dl>
 </li>
 </ul>
 <a name="getBlueOffset(int)">
@@ -1072,17 +1057,14 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getBlueOffset</h4>
-<pre>public static&nbsp;int&nbsp;getBlueOffset(int&nbsp;pixelFormat)
-                         throws java.lang.Exception</pre>
+<pre>public static&nbsp;int&nbsp;getBlueOffset(int&nbsp;pixelFormat)</pre>
 <div class="block">For the given pixel format, returns the number of bytes that the blue
  component is offset from the start of the pixel.  For instance, if a pixel
  of format <code>TJ.PF_BGRX</code> is stored in <code>char pixel[]</code>,
  then the blue component will be
  <code>pixel[TJ.getBlueOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the blue offset for the given pixel format.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the blue offset for the given pixel format.</dd></dl>
 </li>
 </ul>
 <a name="bufSize(int, int, int)">
@@ -1093,16 +1075,13 @@
 <h4>bufSize</h4>
 <pre>public static&nbsp;int&nbsp;bufSize(int&nbsp;width,
           int&nbsp;height,
-          int&nbsp;jpegSubsamp)
-                   throws java.lang.Exception</pre>
+          int&nbsp;jpegSubsamp)</pre>
 <div class="block">Returns the maximum size of the buffer (in bytes) required to hold a JPEG
  image with the given width, height, and level of chrominance subsampling.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>width</code> - the width (in pixels) of the JPEG image</dd><dd><code>height</code> - the height (in pixels) of the JPEG image</dd><dd><code>jpegSubsamp</code> - the level of chrominance subsampling to be used when
  generating the JPEG image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.SAMP_*</code></a>)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the maximum size of the buffer (in bytes) required to hold a JPEG
- image with the given width, height, and level of chrominance subsampling.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image with the given width, height, and level of chrominance subsampling.</dd></dl>
 </li>
 </ul>
 <a name="bufSizeYUV(int, int, int, int)">
@@ -1114,17 +1093,14 @@
 <pre>public static&nbsp;int&nbsp;bufSizeYUV(int&nbsp;width,
              int&nbsp;pad,
              int&nbsp;height,
-             int&nbsp;subsamp)
-                      throws java.lang.Exception</pre>
+             int&nbsp;subsamp)</pre>
 <div class="block">Returns the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>width</code> - the width (in pixels) of the YUV image</dd><dd><code>pad</code> - the width of each line in each plane of the image is padded to
  the nearest multiple of this number of bytes (must be a power of 2.)</dd><dd><code>height</code> - the height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
  image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.SAMP_*</code></a>)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the size of the buffer (in bytes) required to hold a YUV planar
- image with the given width, height, and level of chrominance subsampling.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image with the given width, height, and level of chrominance subsampling.</dd></dl>
 </li>
 </ul>
 <a name="bufSizeYUV(int, int, int)">
@@ -1136,11 +1112,8 @@
 <pre>@Deprecated
 public static&nbsp;int&nbsp;bufSizeYUV(int&nbsp;width,
                         int&nbsp;height,
-                        int&nbsp;subsamp)
-                      throws java.lang.Exception</pre>
+                        int&nbsp;subsamp)</pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int,%20int,%20int,%20int)"><code>bufSizeYUV(int, int, int, int)</code></a> instead.</i></div>
-<dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
 </li>
 </ul>
 <a name="planeSizeYUV(int, int, int, int, int)">
@@ -1153,8 +1126,7 @@
                int&nbsp;width,
                int&nbsp;stride,
                int&nbsp;height,
-               int&nbsp;subsamp)
-                        throws java.lang.Exception</pre>
+               int&nbsp;subsamp)</pre>
 <div class="block">Returns the size of the buffer (in bytes) required to hold a YUV image
  plane with the given parameters.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>componentID</code> - ID number of the image plane (0 = Y, 1 = U/Cb,
@@ -1163,9 +1135,7 @@
  height of the whole image, not the plane height.</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
  image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.SAMP_*</code></a>)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the size of the buffer (in bytes) required to hold a YUV planar
- image with the given parameters.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image with the given parameters.</dd></dl>
 </li>
 </ul>
 <a name="planeWidth(int, int, int)">
@@ -1176,16 +1146,13 @@
 <h4>planeWidth</h4>
 <pre>public static&nbsp;int&nbsp;planeWidth(int&nbsp;componentID,
              int&nbsp;width,
-             int&nbsp;subsamp)
-                      throws java.lang.Exception</pre>
+             int&nbsp;subsamp)</pre>
 <div class="block">Returns the plane width of a YUV image plane with the given parameters.
  Refer to <a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg"><code>YUVImage</code></a> for a description of plane width.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>componentID</code> - ID number of the image plane (0 = Y, 1 = U/Cb,
  2 = V/Cr)</dd><dd><code>width</code> - width (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV image
  (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the plane width of a YUV image plane with the given parameters.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the plane width of a YUV image plane with the given parameters.</dd></dl>
 </li>
 </ul>
 <a name="planeHeight(int, int, int)">
@@ -1196,16 +1163,13 @@
 <h4>planeHeight</h4>
 <pre>public static&nbsp;int&nbsp;planeHeight(int&nbsp;componentID,
               int&nbsp;height,
-              int&nbsp;subsamp)
-                       throws java.lang.Exception</pre>
+              int&nbsp;subsamp)</pre>
 <div class="block">Returns the plane height of a YUV image plane with the given parameters.
  Refer to <a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg"><code>YUVImage</code></a> for a description of plane height.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>componentID</code> - ID number of the image plane (0 = Y, 1 = U/Cb,
  2 = V/Cr)</dd><dd><code>height</code> - height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV image
  (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the plane height of a YUV image plane with the given parameters.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the plane height of a YUV image plane with the given parameters.</dd></dl>
 </li>
 </ul>
 <a name="getScalingFactors()">
@@ -1214,14 +1178,11 @@
 <ul class="blockListLast">
 <li class="blockList">
 <h4>getScalingFactors</h4>
-<pre>public static&nbsp;<a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg">TJScalingFactor</a>[]&nbsp;getScalingFactors()
-                                           throws java.lang.Exception</pre>
+<pre>public static&nbsp;<a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg">TJScalingFactor</a>[]&nbsp;getScalingFactors()</pre>
 <div class="block">Returns a list of fractional scaling factors that the JPEG decompressor in
  this implementation of TurboJPEG supports.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>a list of fractional scaling factors that the JPEG decompressor in
- this implementation of TurboJPEG supports.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ this implementation of TurboJPEG supports.</dd></dl>
 </li>
 </ul>
 </li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
index b7fa3db..29f12b7 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
@@ -90,10 +90,15 @@
 <div class="description">
 <ul class="blockList">
 <li class="blockList">
+<dl>
+<dt>All Implemented Interfaces:</dt>
+<dd>java.io.Closeable, java.lang.AutoCloseable</dd>
+</dl>
 <hr>
 <br>
 <pre>public class <span class="strong">TJCompressor</span>
-extends java.lang.Object</pre>
+extends java.lang.Object
+implements java.io.Closeable</pre>
 <div class="block">TurboJPEG compressor</div>
 </li>
 </ul>
@@ -376,10 +381,10 @@
 <li class="blockList">
 <h4>TJCompressor</h4>
 <pre>public&nbsp;TJCompressor()
-             throws java.lang.Exception</pre>
+             throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG compressor instance.</div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJCompressor(byte[], int, int, int, int, int, int)">
@@ -395,14 +400,14 @@
             int&nbsp;pitch,
             int&nbsp;height,
             int&nbsp;pixelFormat)
-             throws java.lang.Exception</pre>
+             throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG compressor instance and associate the uncompressed
  source image stored in <code>srcImage</code> with the newly created
  instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>srcImage</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>x</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>y</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>width</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>pitch</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>height</code> - see <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> for description</dd><dd><code>pixelFormat</code> - pixel format of the source image (one of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGB"><code>TJ.PF_*</code></a>)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJCompressor(byte[], int, int, int, int)">
@@ -417,11 +422,11 @@
                        int&nbsp;pitch,
                        int&nbsp;height,
                        int&nbsp;pixelFormat)
-             throws java.lang.Exception</pre>
+             throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#TJCompressor(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>TJCompressor(byte[], int, int, int, int, int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJCompressor(java.awt.image.BufferedImage, int, int, int, int)">
@@ -435,7 +440,7 @@
             int&nbsp;y,
             int&nbsp;width,
             int&nbsp;height)
-             throws java.lang.Exception</pre>
+             throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG compressor instance and associate the uncompressed
  source image stored in <code>srcImage</code> with the newly created
  instance.</div>
@@ -446,7 +451,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> for description</dd><dd><code>height</code> - see
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> for description</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 </li>
@@ -470,7 +475,7 @@
                   int&nbsp;pitch,
                   int&nbsp;height,
                   int&nbsp;pixelFormat)
-                    throws java.lang.Exception</pre>
+                    throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Associate an uncompressed RGB, grayscale, or CMYK source image with this
  compressor instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>srcImage</code> - image buffer containing RGB, grayscale, or CMYK pixels to
@@ -488,7 +493,7 @@
  which the JPEG or YUV image should be compressed/encoded</dd><dd><code>pixelFormat</code> - pixel format of the source image (one of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGB"><code>TJ.PF_*</code></a>)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setSourceImage(byte[], int, int, int, int)">
@@ -503,11 +508,11 @@
                              int&nbsp;pitch,
                              int&nbsp;height,
                              int&nbsp;pixelFormat)
-                    throws java.lang.Exception</pre>
+                    throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[],%20int,%20int,%20int,%20int,%20int,%20int)"><code>setSourceImage(byte[], int, int, int, int, int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setSourceImage(java.awt.image.BufferedImage, int, int, int, int)">
@@ -521,7 +526,7 @@
                   int&nbsp;y,
                   int&nbsp;width,
                   int&nbsp;height)
-                    throws java.lang.Exception</pre>
+                    throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Associate an uncompressed RGB or grayscale source image with this
  compressor instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>srcImage</code> - a <code>BufferedImage</code> instance containing RGB or
@@ -533,7 +538,7 @@
  which the JPEG or YUV image should be compressed/encoded (0 = use the
  height of the source image)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setSourceImage(org.libjpegturbo.turbojpeg.YUVImage)">
@@ -543,13 +548,13 @@
 <li class="blockList">
 <h4>setSourceImage</h4>
 <pre>public&nbsp;void&nbsp;setSourceImage(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;srcImage)
-                    throws java.lang.Exception</pre>
+                    throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Associate an uncompressed YUV planar source image with this compressor
  instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>srcImage</code> - YUV planar image to be compressed.  This image is not
  modified.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setSubsamp(int)">
@@ -558,8 +563,7 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>setSubsamp</h4>
-<pre>public&nbsp;void&nbsp;setSubsamp(int&nbsp;newSubsamp)
-                throws java.lang.Exception</pre>
+<pre>public&nbsp;void&nbsp;setSubsamp(int&nbsp;newSubsamp)</pre>
 <div class="block">Set the level of chrominance subsampling for subsequent compress/encode
  operations.  When pixels are converted from RGB to YCbCr (see
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><code>TJ.CS_YCbCr</code></a>) or from CMYK to YCCK (see <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><code>TJ.CS_YCCK</code></a>) as part
@@ -577,9 +581,7 @@
  destination.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>newSubsamp</code> - the level of chrominance subsampling to use in
  subsequent compress/encode operations (one of
- <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="setJPEGQuality(int)">
@@ -588,13 +590,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>setJPEGQuality</h4>
-<pre>public&nbsp;void&nbsp;setJPEGQuality(int&nbsp;quality)
-                    throws java.lang.Exception</pre>
+<pre>public&nbsp;void&nbsp;setJPEGQuality(int&nbsp;quality)</pre>
 <div class="block">Set the JPEG image quality level for subsequent compress operations.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>quality</code> - the new JPEG image quality level (1 to 100, 1 = worst,
- 100 = best)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ 100 = best)</dd></dl>
 </li>
 </ul>
 <a name="compress(byte[], int)">
@@ -605,7 +604,7 @@
 <h4>compress</h4>
 <pre>public&nbsp;void&nbsp;compress(byte[]&nbsp;dstBuf,
             int&nbsp;flags)
-              throws java.lang.Exception</pre>
+              throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Compress the uncompressed source image associated with this compressor
  instance and output a JPEG image to the given destination buffer.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>dstBuf</code> - buffer that will receive the JPEG image.  Use
@@ -614,7 +613,7 @@
  subsampling.</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="compress(int)">
@@ -624,7 +623,7 @@
 <li class="blockList">
 <h4>compress</h4>
 <pre>public&nbsp;byte[]&nbsp;compress(int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Compress the uncompressed source image associated with this compressor
  instance and return a buffer containing a JPEG image.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>flags</code> - the bitwise OR of one or more of
@@ -632,7 +631,7 @@
 <dt><span class="strong">Returns:</span></dt><dd>a buffer containing a JPEG image.  The length of this buffer will
  not be equal to the size of the JPEG image.  Use <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#getCompressedSize()"><code>getCompressedSize()</code></a> to obtain the size of the JPEG image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="compress(java.awt.image.BufferedImage, byte[], int)">
@@ -645,12 +644,12 @@
 public&nbsp;void&nbsp;compress(java.awt.image.BufferedImage&nbsp;srcImage,
                        byte[]&nbsp;dstBuf,
                        int&nbsp;flags)
-              throws java.lang.Exception</pre>
+              throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> and
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#compress(byte[],%20int)"><code>compress(byte[], int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="compress(java.awt.image.BufferedImage, int)">
@@ -662,12 +661,12 @@
 <pre>@Deprecated
 public&nbsp;byte[]&nbsp;compress(java.awt.image.BufferedImage&nbsp;srcImage,
                          int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> and
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#compress(int)"><code>compress(int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(org.libjpegturbo.turbojpeg.YUVImage, int)">
@@ -678,7 +677,7 @@
 <h4>encodeYUV</h4>
 <pre>public&nbsp;void&nbsp;encodeYUV(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;dstImage,
              int&nbsp;flags)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Encode the uncompressed source image associated with this compressor
  instance into a YUV planar image and store it in the given
  <code>YUVImage</code> instance.   This method uses the accelerated color
@@ -689,7 +688,7 @@
  image</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(byte[], int)">
@@ -701,10 +700,10 @@
 <pre>@Deprecated
 public&nbsp;void&nbsp;encodeYUV(byte[]&nbsp;dstBuf,
                         int&nbsp;flags)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#encodeYUV(org.libjpegturbo.turbojpeg.YUVImage,%20int)"><code>encodeYUV(YUVImage, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(int, int)">
@@ -715,7 +714,7 @@
 <h4>encodeYUV</h4>
 <pre>public&nbsp;<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;encodeYUV(int&nbsp;pad,
                  int&nbsp;flags)
-                   throws java.lang.Exception</pre>
+                   throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Encode the uncompressed source image associated with this compressor
  instance into a unified YUV planar image buffer and return a
  <code>YUVImage</code> instance containing the encoded image.  This method
@@ -728,7 +727,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Returns:</span></dt><dd>a YUV planar image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(int[], int)">
@@ -739,7 +738,7 @@
 <h4>encodeYUV</h4>
 <pre>public&nbsp;<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;encodeYUV(int[]&nbsp;strides,
                  int&nbsp;flags)
-                   throws java.lang.Exception</pre>
+                   throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Encode the uncompressed source image associated with this compressor
  instance into separate Y, U (Cb), and V (Cr) image planes and return a
  <code>YUVImage</code> instance containing the encoded image planes.  This
@@ -756,7 +755,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Returns:</span></dt><dd>a YUV planar image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(int)">
@@ -767,10 +766,10 @@
 <h4>encodeYUV</h4>
 <pre>@Deprecated
 public&nbsp;byte[]&nbsp;encodeYUV(int&nbsp;flags)
-                 throws java.lang.Exception</pre>
+                 throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#encodeYUV(int,%20int)"><code>encodeYUV(int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(java.awt.image.BufferedImage, byte[], int)">
@@ -783,12 +782,12 @@
 public&nbsp;void&nbsp;encodeYUV(java.awt.image.BufferedImage&nbsp;srcImage,
                         byte[]&nbsp;dstBuf,
                         int&nbsp;flags)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> and
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#encodeYUV(byte[],%20int)"><code>encodeYUV(byte[], int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="encodeYUV(java.awt.image.BufferedImage, int)">
@@ -800,12 +799,12 @@
 <pre>@Deprecated
 public&nbsp;byte[]&nbsp;encodeYUV(java.awt.image.BufferedImage&nbsp;srcImage,
                           int&nbsp;flags)
-                 throws java.lang.Exception</pre>
+                 throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(java.awt.image.BufferedImage,%20int,%20int,%20int,%20int)"><code>setSourceImage(BufferedImage, int, int, int, int)</code></a> and
  <a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#encodeYUV(int,%20int)"><code>encodeYUV(int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="getCompressedSize()">
@@ -828,10 +827,15 @@
 <li class="blockList">
 <h4>close</h4>
 <pre>public&nbsp;void&nbsp;close()
-           throws java.lang.Exception</pre>
+           throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Free the native structures associated with this compressor instance.</div>
-<dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl>
+<dt><strong>Specified by:</strong></dt>
+<dd><code>close</code>&nbsp;in interface&nbsp;<code>java.io.Closeable</code></dd>
+<dt><strong>Specified by:</strong></dt>
+<dd><code>close</code>&nbsp;in interface&nbsp;<code>java.lang.AutoCloseable</code></dd>
+<dt><span class="strong">Throws:</span></dt>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="finalize()">
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
index c2b6e61..6bd6fd2 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
@@ -144,7 +144,7 @@
                 int&nbsp;componentID,
                 int&nbsp;transformID,
                 <a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a>&nbsp;transform)
-                  throws java.lang.Exception</pre>
+                  throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
  JPEG image.  This allows for custom filters or other transformations to be
@@ -165,7 +165,7 @@
  transform in the <code>transforms</code> array that was passed to <a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a>.</dd><dd><code>transform</code> - a <a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg"><code>TJTransform</code></a> instance that specifies the
  parameters and/or cropping region for this transform</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 </li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
index dc1dcbd..a914de9 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
@@ -33,7 +33,7 @@
 <div class="subNav">
 <ul class="navList">
 <li><a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
-<li><a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
 </ul>
 <ul class="navList">
 <li><a href="../../../index.html?org/libjpegturbo/turbojpeg/TJDecompressor.html" target="_top">Frames</a></li>
@@ -91,13 +91,18 @@
 <ul class="blockList">
 <li class="blockList">
 <dl>
+<dt>All Implemented Interfaces:</dt>
+<dd>java.io.Closeable, java.lang.AutoCloseable</dd>
+</dl>
+<dl>
 <dt>Direct Known Subclasses:</dt>
 <dd><a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html" title="class in org.libjpegturbo.turbojpeg">TJTransformer</a></dd>
 </dl>
 <hr>
 <br>
 <pre>public class <span class="strong">TJDecompressor</span>
-extends java.lang.Object</pre>
+extends java.lang.Object
+implements java.io.Closeable</pre>
 <div class="block">TurboJPEG decompressor</div>
 </li>
 </ul>
@@ -529,10 +534,10 @@
 <li class="blockList">
 <h4>TJDecompressor</h4>
 <pre>public&nbsp;TJDecompressor()
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG decompresssor instance.</div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJDecompressor(byte[])">
@@ -542,13 +547,13 @@
 <li class="blockList">
 <h4>TJDecompressor</h4>
 <pre>public&nbsp;TJDecompressor(byte[]&nbsp;jpegImage)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG decompressor instance and associate the JPEG source
  image stored in <code>jpegImage</code> with the newly created instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>jpegImage</code> - JPEG image buffer (size of the JPEG image is assumed to
  be the length of the array.)  This buffer is not modified.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJDecompressor(byte[], int)">
@@ -559,13 +564,13 @@
 <h4>TJDecompressor</h4>
 <pre>public&nbsp;TJDecompressor(byte[]&nbsp;jpegImage,
               int&nbsp;imageSize)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG decompressor instance and associate the JPEG source
  image of length <code>imageSize</code> bytes stored in
  <code>jpegImage</code> with the newly created instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>jpegImage</code> - JPEG image buffer.  This buffer is not modified.</dd><dd><code>imageSize</code> - size of the JPEG image (in bytes)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJDecompressor(org.libjpegturbo.turbojpeg.YUVImage)">
@@ -575,14 +580,14 @@
 <li class="blockList">
 <h4>TJDecompressor</h4>
 <pre>public&nbsp;TJDecompressor(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;yuvImage)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG decompressor instance and associate the YUV planar
  source image stored in <code>yuvImage</code> with the newly created
  instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>yuvImage</code> - <a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg"><code>YUVImage</code></a> instance containing a YUV planar
  image to be decoded.  This image is not modified.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 </li>
@@ -601,13 +606,13 @@
 <h4>setSourceImage</h4>
 <pre>public&nbsp;void&nbsp;setSourceImage(byte[]&nbsp;jpegImage,
                   int&nbsp;imageSize)
-                    throws java.lang.Exception</pre>
+                    throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Associate the JPEG image of length <code>imageSize</code> bytes stored in
  <code>jpegImage</code> with this decompressor instance.  This image will
  be used as the source image for subsequent decompress operations.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>jpegImage</code> - JPEG image buffer.  This buffer is not modified.</dd><dd><code>imageSize</code> - size of the JPEG image (in bytes)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setJPEGImage(byte[], int)">
@@ -619,10 +624,10 @@
 <pre>@Deprecated
 public&nbsp;void&nbsp;setJPEGImage(byte[]&nbsp;jpegImage,
                            int&nbsp;imageSize)
-                  throws java.lang.Exception</pre>
+                  throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#setSourceImage(byte[],%20int)"><code>setSourceImage(byte[], int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="setSourceImage(org.libjpegturbo.turbojpeg.YUVImage)">
@@ -631,15 +636,12 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>setSourceImage</h4>
-<pre>public&nbsp;void&nbsp;setSourceImage(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;srcImage)
-                    throws java.lang.Exception</pre>
+<pre>public&nbsp;void&nbsp;setSourceImage(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;srcImage)</pre>
 <div class="block">Associate the specified YUV planar source image with this decompressor
  instance.  Subsequent decompress operations will decode this image into an
  RGB or grayscale destination image.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>srcImage</code> - <a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg"><code>YUVImage</code></a> instance containing a YUV planar image to
- be decoded.  This image is not modified.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ be decoded.  This image is not modified.</dd></dl>
 </li>
 </ul>
 <a name="getWidth()">
@@ -648,14 +650,11 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getWidth</h4>
-<pre>public&nbsp;int&nbsp;getWidth()
-             throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getWidth()</pre>
 <div class="block">Returns the width of the source image (JPEG or YUV) associated with this
  decompressor instance.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the width of the source image (JPEG or YUV) associated with this
- decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getHeight()">
@@ -664,14 +663,11 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getHeight</h4>
-<pre>public&nbsp;int&nbsp;getHeight()
-              throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getHeight()</pre>
 <div class="block">Returns the height of the source image (JPEG or YUV) associated with this
  decompressor instance.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the height of the source image (JPEG or YUV) associated with this
- decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getSubsamp()">
@@ -680,15 +676,12 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getSubsamp</h4>
-<pre>public&nbsp;int&nbsp;getSubsamp()
-               throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getSubsamp()</pre>
 <div class="block">Returns the level of chrominance subsampling used in the source image
  (JPEG or YUV) associated with this decompressor instance.  See
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the level of chrominance subsampling used in the source image
- (JPEG or YUV) associated with this decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ (JPEG or YUV) associated with this decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getColorspace()">
@@ -697,15 +690,12 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getColorspace</h4>
-<pre>public&nbsp;int&nbsp;getColorspace()
-                  throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getColorspace()</pre>
 <div class="block">Returns the colorspace used in the source image (JPEG or YUV) associated
  with this decompressor instance.  See <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_RGB"><code>TJ.CS_*</code></a>.  If the
  source image is YUV, then this always returns <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><code>TJ.CS_YCbCr</code></a>.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the colorspace used in the source image (JPEG or YUV) associated
- with this decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ with this decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getJPEGBuf()">
@@ -714,12 +704,9 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getJPEGBuf</h4>
-<pre>public&nbsp;byte[]&nbsp;getJPEGBuf()
-                  throws java.lang.Exception</pre>
+<pre>public&nbsp;byte[]&nbsp;getJPEGBuf()</pre>
 <div class="block">Returns the JPEG image buffer associated with this decompressor instance.</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the JPEG image buffer associated with this decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the JPEG image buffer associated with this decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getJPEGSize()">
@@ -728,14 +715,11 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getJPEGSize</h4>
-<pre>public&nbsp;int&nbsp;getJPEGSize()
-                throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getJPEGSize()</pre>
 <div class="block">Returns the size of the JPEG image (in bytes) associated with this
  decompressor instance.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the size of the JPEG image (in bytes) associated with this
- decompressor instance.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ decompressor instance.</dd></dl>
 </li>
 </ul>
 <a name="getScaledWidth(int, int)">
@@ -745,8 +729,7 @@
 <li class="blockList">
 <h4>getScaledWidth</h4>
 <pre>public&nbsp;int&nbsp;getScaledWidth(int&nbsp;desiredWidth,
-                 int&nbsp;desiredHeight)
-                   throws java.lang.Exception</pre>
+                 int&nbsp;desiredHeight)</pre>
 <div class="block">Returns the width of the largest scaled-down image that the TurboJPEG
  decompressor can generate without exceeding the desired image width and
  height.</div>
@@ -759,9 +742,7 @@
  the scaled image size.)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the width of the largest scaled-down image that the TurboJPEG
  decompressor can generate without exceeding the desired image width and
- height.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ height.</dd></dl>
 </li>
 </ul>
 <a name="getScaledHeight(int, int)">
@@ -771,8 +752,7 @@
 <li class="blockList">
 <h4>getScaledHeight</h4>
 <pre>public&nbsp;int&nbsp;getScaledHeight(int&nbsp;desiredWidth,
-                  int&nbsp;desiredHeight)
-                    throws java.lang.Exception</pre>
+                  int&nbsp;desiredHeight)</pre>
 <div class="block">Returns the height of the largest scaled-down image that the TurboJPEG
  decompressor can generate without exceeding the desired image width and
  height.</div>
@@ -785,9 +765,7 @@
  the scaled image size.)</dd>
 <dt><span class="strong">Returns:</span></dt><dd>the height of the largest scaled-down image that the TurboJPEG
  decompressor can generate without exceeding the desired image width and
- height.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ height.</dd></dl>
 </li>
 </ul>
 <a name="decompress(byte[], int, int, int, int, int, int, int)">
@@ -804,7 +782,7 @@
               int&nbsp;desiredHeight,
               int&nbsp;pixelFormat,
               int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a grayscale, RGB, or CMYK image
  to the given destination buffer.</div>
@@ -852,7 +830,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGB"><code>TJ.PF_*</code></a>)</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompress(byte[], int, int, int, int, int)">
@@ -868,11 +846,11 @@
                          int&nbsp;desiredHeight,
                          int&nbsp;pixelFormat,
                          int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use
  <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[],%20int,%20int,%20int,%20int,%20int,%20int,%20int)"><code>decompress(byte[], int, int, int, int, int, int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompress(int, int, int, int, int)">
@@ -886,7 +864,7 @@
                 int&nbsp;desiredHeight,
                 int&nbsp;pixelFormat,
                 int&nbsp;flags)
-                  throws java.lang.Exception</pre>
+                  throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image associated with this decompressor
  instance and return a buffer containing the decompressed image.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>desiredWidth</code> - see
@@ -900,7 +878,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Returns:</span></dt><dd>a buffer containing the decompressed image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompressToYUV(org.libjpegturbo.turbojpeg.YUVImage, int)">
@@ -911,7 +889,7 @@
 <h4>decompressToYUV</h4>
 <pre>public&nbsp;void&nbsp;decompressToYUV(<a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg">YUVImage</a>&nbsp;dstImage,
                    int&nbsp;flags)
-                     throws java.lang.Exception</pre>
+                     throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image associated with this decompressor
  instance into a YUV planar image and store it in the given
  <code>YUVImage</code> instance.  This method performs JPEG decompression
@@ -926,7 +904,7 @@
  source image.</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompressToYUV(byte[], int)">
@@ -938,10 +916,10 @@
 <pre>@Deprecated
 public&nbsp;void&nbsp;decompressToYUV(byte[]&nbsp;dstBuf,
                               int&nbsp;flags)
-                     throws java.lang.Exception</pre>
+                     throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(org.libjpegturbo.turbojpeg.YUVImage,%20int)"><code>decompressToYUV(YUVImage, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompressToYUV(int, int[], int, int)">
@@ -954,7 +932,7 @@
                        int[]&nbsp;strides,
                        int&nbsp;desiredHeight,
                        int&nbsp;flags)
-                         throws java.lang.Exception</pre>
+                         throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image associated with this decompressor
  instance into a set of Y, U (Cb), and V (Cr) image planes and return a
  <code>YUVImage</code> instance containing the decompressed image planes.
@@ -984,7 +962,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Returns:</span></dt><dd>a YUV planar image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompressToYUV(int, int, int, int)">
@@ -997,7 +975,7 @@
                        int&nbsp;pad,
                        int&nbsp;desiredHeight,
                        int&nbsp;flags)
-                         throws java.lang.Exception</pre>
+                         throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image associated with this decompressor
  instance into a unified YUV planar image buffer and return a
  <code>YUVImage</code> instance containing the decompressed image.  This
@@ -1023,7 +1001,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Returns:</span></dt><dd>a YUV planar image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompressToYUV(int)">
@@ -1034,10 +1012,10 @@
 <h4>decompressToYUV</h4>
 <pre>@Deprecated
 public&nbsp;byte[]&nbsp;decompressToYUV(int&nbsp;flags)
-                       throws java.lang.Exception</pre>
+                       throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block"><span class="strong">Deprecated.</span>&nbsp;<i>Use <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int,%20int,%20int,%20int)"><code>decompressToYUV(int, int, int, int)</code></a> instead.</i></div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompress(int[], int, int, int, int, int, int, int)">
@@ -1054,7 +1032,7 @@
               int&nbsp;desiredHeight,
               int&nbsp;pixelFormat,
               int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a grayscale, RGB, or CMYK image
  to the given destination buffer.</div>
@@ -1100,7 +1078,7 @@
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGB"><code>TJ.PF_*</code></a>)</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompress(java.awt.image.BufferedImage, int)">
@@ -1111,7 +1089,7 @@
 <h4>decompress</h4>
 <pre>public&nbsp;void&nbsp;decompress(java.awt.image.BufferedImage&nbsp;dstImage,
               int&nbsp;flags)
-                throws java.lang.Exception</pre>
+                throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a decompressed/decoded image to
  the given <code>BufferedImage</code> instance.</div>
@@ -1124,7 +1102,7 @@
  height of the YUV image.</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="decompress(int, int, int, int)">
@@ -1137,7 +1115,7 @@
                                       int&nbsp;desiredHeight,
                                       int&nbsp;bufferedImageType,
                                       int&nbsp;flags)
-                                        throws java.lang.Exception</pre>
+                                        throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and return a <code>BufferedImage</code>
  instance containing the decompressed/decoded image.</div>
@@ -1152,7 +1130,7 @@
 <dt><span class="strong">Returns:</span></dt><dd>a <code>BufferedImage</code> instance containing the
  decompressed/decoded image.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="close()">
@@ -1162,10 +1140,15 @@
 <li class="blockList">
 <h4>close</h4>
 <pre>public&nbsp;void&nbsp;close()
-           throws java.lang.Exception</pre>
+           throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Free the native structures associated with this decompressor instance.</div>
-<dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl>
+<dt><strong>Specified by:</strong></dt>
+<dd><code>close</code>&nbsp;in interface&nbsp;<code>java.io.Closeable</code></dd>
+<dt><strong>Specified by:</strong></dt>
+<dd><code>close</code>&nbsp;in interface&nbsp;<code>java.lang.AutoCloseable</code></dd>
+<dt><span class="strong">Throws:</span></dt>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="finalize()">
@@ -1208,7 +1191,7 @@
 <div class="subNav">
 <ul class="navList">
 <li><a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
-<li><a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
 </ul>
 <ul class="navList">
 <li><a href="../../../index.html?org/libjpegturbo/turbojpeg/TJDecompressor.html" target="_top">Frames</a></li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJException.html b/java/doc/org/libjpegturbo/turbojpeg/TJException.html
new file mode 100644
index 0000000..6088066
--- /dev/null
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJException.html
@@ -0,0 +1,287 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<!-- NewPage -->
+<html lang="en">
+<head>
+<title>TJException</title>
+<link rel="stylesheet" type="text/css" href="../../../stylesheet.css" title="Style">
+</head>
+<body>
+<script type="text/javascript"><!--
+    if (location.href.indexOf('is-external=true') == -1) {
+        parent.document.title="TJException";
+    }
+//-->
+</script>
+<noscript>
+<div>JavaScript is disabled on your browser.</div>
+</noscript>
+<!-- ========= START OF TOP NAVBAR ======= -->
+<div class="topNav"><a name="navbar_top">
+<!--   -->
+</a><a href="#skip-navbar_top" title="Skip navigation links"></a><a name="navbar_top_firstrow">
+<!--   -->
+</a>
+<ul class="navList" title="Navigation">
+<li><a href="../../../org/libjpegturbo/turbojpeg/package-summary.html">Package</a></li>
+<li class="navBarCell1Rev">Class</li>
+<li><a href="package-tree.html">Tree</a></li>
+<li><a href="../../../deprecated-list.html">Deprecated</a></li>
+<li><a href="../../../index-all.html">Index</a></li>
+<li><a href="../../../help-doc.html">Help</a></li>
+</ul>
+</div>
+<div class="subNav">
+<ul class="navList">
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
+</ul>
+<ul class="navList">
+<li><a href="../../../index.html?org/libjpegturbo/turbojpeg/TJException.html" target="_top">Frames</a></li>
+<li><a href="TJException.html" target="_top">No Frames</a></li>
+</ul>
+<ul class="navList" id="allclasses_navbar_top">
+<li><a href="../../../allclasses-noframe.html">All Classes</a></li>
+</ul>
+<div>
+<script type="text/javascript"><!--
+  allClassesLink = document.getElementById("allclasses_navbar_top");
+  if(window==top) {
+    allClassesLink.style.display = "block";
+  }
+  else {
+    allClassesLink.style.display = "none";
+  }
+  //-->
+</script>
+</div>
+<div>
+<ul class="subNavList">
+<li>Summary:&nbsp;</li>
+<li>Nested&nbsp;|&nbsp;</li>
+<li>Field&nbsp;|&nbsp;</li>
+<li><a href="#constructor_summary">Constr</a>&nbsp;|&nbsp;</li>
+<li><a href="#methods_inherited_from_class_java.lang.Throwable">Method</a></li>
+</ul>
+<ul class="subNavList">
+<li>Detail:&nbsp;</li>
+<li>Field&nbsp;|&nbsp;</li>
+<li><a href="#constructor_detail">Constr</a>&nbsp;|&nbsp;</li>
+<li>Method</li>
+</ul>
+</div>
+<a name="skip-navbar_top">
+<!--   -->
+</a></div>
+<!-- ========= END OF TOP NAVBAR ========= -->
+<!-- ======== START OF CLASS DATA ======== -->
+<div class="header">
+<div class="subTitle">org.libjpegturbo.turbojpeg</div>
+<h2 title="Class TJException" class="title">Class TJException</h2>
+</div>
+<div class="contentContainer">
+<ul class="inheritance">
+<li>java.lang.Object</li>
+<li>
+<ul class="inheritance">
+<li>java.lang.Throwable</li>
+<li>
+<ul class="inheritance">
+<li>java.lang.Exception</li>
+<li>
+<ul class="inheritance">
+<li>java.io.IOException</li>
+<li>
+<ul class="inheritance">
+<li>org.libjpegturbo.turbojpeg.TJException</li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+<div class="description">
+<ul class="blockList">
+<li class="blockList">
+<dl>
+<dt>All Implemented Interfaces:</dt>
+<dd>java.io.Serializable</dd>
+</dl>
+<hr>
+<br>
+<pre>public class <span class="strong">TJException</span>
+extends java.io.IOException</pre>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../serialized-form.html#org.libjpegturbo.turbojpeg.TJException">Serialized Form</a></dd></dl>
+</li>
+</ul>
+</div>
+<div class="summary">
+<ul class="blockList">
+<li class="blockList">
+<!-- ======== CONSTRUCTOR SUMMARY ======== -->
+<ul class="blockList">
+<li class="blockList"><a name="constructor_summary">
+<!--   -->
+</a>
+<h3>Constructor Summary</h3>
+<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Constructor Summary table, listing constructors, and an explanation">
+<caption><span>Constructors</span><span class="tabEnd">&nbsp;</span></caption>
+<tr>
+<th class="colOne" scope="col">Constructor and Description</th>
+</tr>
+<tr class="altColor">
+<td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException()">TJException</a></strong>()</code>&nbsp;</td>
+</tr>
+<tr class="rowColor">
+<td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String)">TJException</a></strong>(java.lang.String&nbsp;message)</code>&nbsp;</td>
+</tr>
+<tr class="altColor">
+<td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String,%20java.lang.Throwable)">TJException</a></strong>(java.lang.String&nbsp;message,
+           java.lang.Throwable&nbsp;cause)</code>&nbsp;</td>
+</tr>
+<tr class="rowColor">
+<td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.Throwable)">TJException</a></strong>(java.lang.Throwable&nbsp;cause)</code>&nbsp;</td>
+</tr>
+</table>
+</li>
+</ul>
+<!-- ========== METHOD SUMMARY =========== -->
+<ul class="blockList">
+<li class="blockList"><a name="method_summary">
+<!--   -->
+</a>
+<h3>Method Summary</h3>
+<ul class="blockList">
+<li class="blockList"><a name="methods_inherited_from_class_java.lang.Throwable">
+<!--   -->
+</a>
+<h3>Methods inherited from class&nbsp;java.lang.Throwable</h3>
+<code>addSuppressed, fillInStackTrace, getCause, getLocalizedMessage, getMessage, getStackTrace, getSuppressed, initCause, printStackTrace, printStackTrace, printStackTrace, setStackTrace, toString</code></li>
+</ul>
+<ul class="blockList">
+<li class="blockList"><a name="methods_inherited_from_class_java.lang.Object">
+<!--   -->
+</a>
+<h3>Methods inherited from class&nbsp;java.lang.Object</h3>
+<code>clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait</code></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+<div class="details">
+<ul class="blockList">
+<li class="blockList">
+<!-- ========= CONSTRUCTOR DETAIL ======== -->
+<ul class="blockList">
+<li class="blockList"><a name="constructor_detail">
+<!--   -->
+</a>
+<h3>Constructor Detail</h3>
+<a name="TJException()">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>TJException</h4>
+<pre>public&nbsp;TJException()</pre>
+</li>
+</ul>
+<a name="TJException(java.lang.String, java.lang.Throwable)">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>TJException</h4>
+<pre>public&nbsp;TJException(java.lang.String&nbsp;message,
+           java.lang.Throwable&nbsp;cause)</pre>
+</li>
+</ul>
+<a name="TJException(java.lang.String)">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>TJException</h4>
+<pre>public&nbsp;TJException(java.lang.String&nbsp;message)</pre>
+</li>
+</ul>
+<a name="TJException(java.lang.Throwable)">
+<!--   -->
+</a>
+<ul class="blockListLast">
+<li class="blockList">
+<h4>TJException</h4>
+<pre>public&nbsp;TJException(java.lang.Throwable&nbsp;cause)</pre>
+</li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+</div>
+<!-- ========= END OF CLASS DATA ========= -->
+<!-- ======= START OF BOTTOM NAVBAR ====== -->
+<div class="bottomNav"><a name="navbar_bottom">
+<!--   -->
+</a><a href="#skip-navbar_bottom" title="Skip navigation links"></a><a name="navbar_bottom_firstrow">
+<!--   -->
+</a>
+<ul class="navList" title="Navigation">
+<li><a href="../../../org/libjpegturbo/turbojpeg/package-summary.html">Package</a></li>
+<li class="navBarCell1Rev">Class</li>
+<li><a href="package-tree.html">Tree</a></li>
+<li><a href="../../../deprecated-list.html">Deprecated</a></li>
+<li><a href="../../../index-all.html">Index</a></li>
+<li><a href="../../../help-doc.html">Help</a></li>
+</ul>
+</div>
+<div class="subNav">
+<ul class="navList">
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
+</ul>
+<ul class="navList">
+<li><a href="../../../index.html?org/libjpegturbo/turbojpeg/TJException.html" target="_top">Frames</a></li>
+<li><a href="TJException.html" target="_top">No Frames</a></li>
+</ul>
+<ul class="navList" id="allclasses_navbar_bottom">
+<li><a href="../../../allclasses-noframe.html">All Classes</a></li>
+</ul>
+<div>
+<script type="text/javascript"><!--
+  allClassesLink = document.getElementById("allclasses_navbar_bottom");
+  if(window==top) {
+    allClassesLink.style.display = "block";
+  }
+  else {
+    allClassesLink.style.display = "none";
+  }
+  //-->
+</script>
+</div>
+<div>
+<ul class="subNavList">
+<li>Summary:&nbsp;</li>
+<li>Nested&nbsp;|&nbsp;</li>
+<li>Field&nbsp;|&nbsp;</li>
+<li><a href="#constructor_summary">Constr</a>&nbsp;|&nbsp;</li>
+<li><a href="#methods_inherited_from_class_java.lang.Throwable">Method</a></li>
+</ul>
+<ul class="subNavList">
+<li>Detail:&nbsp;</li>
+<li>Field&nbsp;|&nbsp;</li>
+<li><a href="#constructor_detail">Constr</a>&nbsp;|&nbsp;</li>
+<li>Method</li>
+</ul>
+</div>
+<a name="skip-navbar_bottom">
+<!--   -->
+</a></div>
+<!-- ======== END OF BOTTOM NAVBAR ======= -->
+</body>
+</html>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html b/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
index c28c00c..35d6882 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
@@ -32,7 +32,7 @@
 </div>
 <div class="subNav">
 <ul class="navList">
-<li><a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
 <li><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
 </ul>
 <ul class="navList">
@@ -192,10 +192,7 @@
 <li class="blockList">
 <h4>TJScalingFactor</h4>
 <pre>public&nbsp;TJScalingFactor(int&nbsp;num,
-               int&nbsp;denom)
-                throws java.lang.Exception</pre>
-<dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+               int&nbsp;denom)</pre>
 </li>
 </ul>
 </li>
@@ -291,7 +288,7 @@
 </div>
 <div class="subNav">
 <ul class="navList">
-<li><a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
+<li><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Prev Class</span></a></li>
 <li><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">Next Class</span></a></li>
 </ul>
 <ul class="navList">
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
index daabfb3..cf65bd2 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
@@ -608,8 +608,7 @@
            int&nbsp;h,
            int&nbsp;op,
            int&nbsp;options,
-           <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</a>&nbsp;cf)
-            throws java.lang.Exception</pre>
+           <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</a>&nbsp;cf)</pre>
 <div class="block">Create a new lossless transform instance with the given parameters.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>x</code> - the left boundary of the cropping region.  This must be evenly
  divisible by the MCU block width (see <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getMCUWidth(int)"><code>TJ.getMCUWidth(int)</code></a>)</dd><dd><code>y</code> - the upper boundary of the cropping region.  This must be evenly
@@ -618,9 +617,7 @@
  <code>x</code>).</dd><dd><code>h</code> - the height of the cropping region.  Setting this to 0 is the
  equivalent of setting it to (height of the source JPEG image -
  <code>y</code>).</dd><dd><code>op</code> - one of the transform operations (<code>OP_*</code>)</dd><dd><code>options</code> - the bitwise OR of one or more of the transform options
- (<code>OPT_*</code>)</dd><dd><code>cf</code> - an instance of an object that implements the <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><code>TJCustomFilter</code></a> interface, or null if no custom filter is needed</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ (<code>OPT_*</code>)</dd><dd><code>cf</code> - an instance of an object that implements the <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><code>TJCustomFilter</code></a> interface, or null if no custom filter is needed</dd></dl>
 </li>
 </ul>
 <a name="TJTransform(java.awt.Rectangle, int, int, org.libjpegturbo.turbojpeg.TJCustomFilter)">
@@ -632,15 +629,12 @@
 <pre>public&nbsp;TJTransform(java.awt.Rectangle&nbsp;r,
            int&nbsp;op,
            int&nbsp;options,
-           <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</a>&nbsp;cf)
-            throws java.lang.Exception</pre>
+           <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</a>&nbsp;cf)</pre>
 <div class="block">Create a new lossless transform instance with the given parameters.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>r</code> - a <code>Rectangle</code> instance that specifies the cropping
  region.  See <a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#TJTransform(int,%20int,%20int,%20int,%20int,%20int,%20org.libjpegturbo.turbojpeg.TJCustomFilter)"><code>TJTransform(int, int, int, int, int, int, TJCustomFilter)</code></a> for more
  detail.</dd><dd><code>op</code> - one of the transform operations (<code>OP_*</code>)</dd><dd><code>options</code> - the bitwise OR of one or more of the transform options
- (<code>OPT_*</code>)</dd><dd><code>cf</code> - an instance of an object that implements the <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><code>TJCustomFilter</code></a> interface, or null if no custom filter is needed</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ (<code>OPT_*</code>)</dd><dd><code>cf</code> - an instance of an object that implements the <a href="../../../org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg"><code>TJCustomFilter</code></a> interface, or null if no custom filter is needed</dd></dl>
 </li>
 </ul>
 </li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
index 32c92bb..36cbdb1 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
@@ -95,6 +95,10 @@
 <div class="description">
 <ul class="blockList">
 <li class="blockList">
+<dl>
+<dt>All Implemented Interfaces:</dt>
+<dd>java.io.Closeable, java.lang.AutoCloseable</dd>
+</dl>
 <hr>
 <br>
 <pre>public class <span class="strong">TJTransformer</span>
@@ -228,10 +232,10 @@
 <li class="blockList">
 <h4>TJTransformer</h4>
 <pre>public&nbsp;TJTransformer()
-              throws java.lang.Exception</pre>
+              throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG lossless transformer instance.</div>
 <dl><dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJTransformer(byte[])">
@@ -241,13 +245,13 @@
 <li class="blockList">
 <h4>TJTransformer</h4>
 <pre>public&nbsp;TJTransformer(byte[]&nbsp;jpegImage)
-              throws java.lang.Exception</pre>
+              throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG lossless transformer instance and associate the JPEG
  image stored in <code>jpegImage</code> with the newly created instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>jpegImage</code> - JPEG image buffer (size of the JPEG image is assumed to
  be the length of the array.)  This buffer is not modified.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="TJTransformer(byte[], int)">
@@ -258,13 +262,13 @@
 <h4>TJTransformer</h4>
 <pre>public&nbsp;TJTransformer(byte[]&nbsp;jpegImage,
              int&nbsp;imageSize)
-              throws java.lang.Exception</pre>
+              throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Create a TurboJPEG lossless transformer instance and associate the JPEG
  image of length <code>imageSize</code> bytes stored in
  <code>jpegImage</code> with the newly created instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>jpegImage</code> - JPEG image buffer.  This buffer is not modified.</dd><dd><code>imageSize</code> - size of the JPEG image (in bytes)</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 </li>
@@ -284,7 +288,7 @@
 <pre>public&nbsp;void&nbsp;transform(byte[][]&nbsp;dstBufs,
              <a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a>[]&nbsp;transforms,
              int&nbsp;flags)
-               throws java.lang.Exception</pre>
+               throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Losslessly transform the JPEG image associated with this transformer
  instance into one or more JPEG images stored in the given destination
  buffers.  Lossless transforms work by moving the raw coefficients from one
@@ -306,7 +310,7 @@
  corresponding transformed output image</dd><dd><code>flags</code> - the bitwise OR of one or more of
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_BOTTOMUP"><code>TJ.FLAG_*</code></a></dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="transform(org.libjpegturbo.turbojpeg.TJTransform[], int)">
@@ -317,7 +321,7 @@
 <h4>transform</h4>
 <pre>public&nbsp;<a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</a>[]&nbsp;transform(<a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a>[]&nbsp;transforms,
                          int&nbsp;flags)
-                           throws java.lang.Exception</pre>
+                           throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Losslessly transform the JPEG image associated with this transformer
  instance and return an array of <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><code>TJDecompressor</code></a> instances, each of
  which has a transformed JPEG image associated with it.</div>
@@ -328,7 +332,7 @@
 <dt><span class="strong">Returns:</span></dt><dd>an array of <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><code>TJDecompressor</code></a> instances, each of
  which has a transformed JPEG image associated with it.</dd>
 <dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dd><code><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></code></dd></dl>
 </li>
 </ul>
 <a name="getTransformedSizes()">
@@ -337,14 +341,11 @@
 <ul class="blockListLast">
 <li class="blockList">
 <h4>getTransformedSizes</h4>
-<pre>public&nbsp;int[]&nbsp;getTransformedSizes()
-                          throws java.lang.Exception</pre>
+<pre>public&nbsp;int[]&nbsp;getTransformedSizes()</pre>
 <div class="block">Returns an array containing the sizes of the transformed JPEG images
  generated by the most recent transform operation.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>an array containing the sizes of the transformed JPEG images
- generated by the most recent transform operation.</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ generated by the most recent transform operation.</dd></dl>
 </li>
 </ul>
 </li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html b/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
index 0a3e0a5..b2be0a0 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
@@ -434,8 +434,7 @@
 <pre>public&nbsp;YUVImage(int&nbsp;width,
         int[]&nbsp;strides,
         int&nbsp;height,
-        int&nbsp;subsamp)
-         throws java.lang.Exception</pre>
+        int&nbsp;subsamp)</pre>
 <div class="block">Create a new <code>YUVImage</code> instance backed by separate image
  planes, and allocate memory for the image planes.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>width</code> - width (in pixels) of the YUV image</dd><dd><code>strides</code> - an array of integers, each specifying the number of bytes
@@ -445,9 +444,7 @@
  strides for all planes will be set to their respective plane widths.  When
  using this constructor, the stride for each plane must be equal to or
  greater than the plane width.</dd><dd><code>height</code> - height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling to be used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="YUVImage(int, int, int, int)">
@@ -459,15 +456,12 @@
 <pre>public&nbsp;YUVImage(int&nbsp;width,
         int&nbsp;pad,
         int&nbsp;height,
-        int&nbsp;subsamp)
-         throws java.lang.Exception</pre>
+        int&nbsp;subsamp)</pre>
 <div class="block">Create a new <code>YUVImage</code> instance backed by a unified image
  buffer, and allocate memory for the image buffer.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>width</code> - width (in pixels) of the YUV image</dd><dd><code>pad</code> - Each line of each plane in the YUV image buffer will be padded
  to this number of bytes (must be a power of 2.)</dd><dd><code>height</code> - height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling to be used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="YUVImage(byte[][], int[], int, int[], int, int)">
@@ -481,8 +475,7 @@
         int&nbsp;width,
         int[]&nbsp;strides,
         int&nbsp;height,
-        int&nbsp;subsamp)
-         throws java.lang.Exception</pre>
+        int&nbsp;subsamp)</pre>
 <div class="block">Create a new <code>YUVImage</code> instance from a set of existing image
  planes.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>planes</code> - an array of buffers representing the Y, U (Cb), and V (Cr)
@@ -503,9 +496,7 @@
  to each plane or to specify that this <code>YUVImage</code> instance is a
  subregion of a larger image (in which case, <code>strides[i]</code> should
  be set to the plane width of plane <code>i</code> in the larger image.)</dd><dd><code>height</code> - height (in pixels) of the new YUV image (or subregion)</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="YUVImage(byte[], int, int, int, int)">
@@ -518,8 +509,7 @@
         int&nbsp;width,
         int&nbsp;pad,
         int&nbsp;height,
-        int&nbsp;subsamp)
-         throws java.lang.Exception</pre>
+        int&nbsp;subsamp)</pre>
 <div class="block">Create a new <code>YUVImage</code> instance from an existing unified image
  buffer.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>yuvImage</code> - image buffer that contains or will contain YUV planar
@@ -529,9 +519,7 @@
  of the image format.)</dd><dd><code>width</code> - width (in pixels) of the YUV image</dd><dd><code>pad</code> - the line padding used in the YUV image buffer.  For
  instance, if each line in each plane of the buffer is padded to the
  nearest multiple of 4 bytes, then <code>pad</code> should be set to 4.</dd><dd><code>height</code> - height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 </li>
@@ -553,8 +541,7 @@
           int&nbsp;width,
           int[]&nbsp;strides,
           int&nbsp;height,
-          int&nbsp;subsamp)
-            throws java.lang.Exception</pre>
+          int&nbsp;subsamp)</pre>
 <div class="block">Assign a set of image planes to this <code>YUVImage</code> instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>planes</code> - an array of buffers representing the Y, U (Cb), and V (Cr)
  image planes (or just the Y plane, if the image is grayscale.)  These
@@ -574,9 +561,7 @@
  to each plane or to specify that this <code>YUVImage</code> image is a
  subregion of a larger image (in which case, <code>strides[i]</code> should
  be set to the plane width of plane <code>i</code> in the larger image.)</dd><dd><code>height</code> - height (in pixels) of the YUV image (or subregion)</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="setBuf(byte[], int, int, int, int)">
@@ -589,8 +574,7 @@
           int&nbsp;width,
           int&nbsp;pad,
           int&nbsp;height,
-          int&nbsp;subsamp)
-            throws java.lang.Exception</pre>
+          int&nbsp;subsamp)</pre>
 <div class="block">Assign a unified image buffer to this <code>YUVImage</code> instance.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>yuvImage</code> - image buffer that contains or will contain YUV planar
  image data.  Use <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int,%20int,%20int,%20int)"><code>TJ.bufSizeYUV(int, int, int, int)</code></a> to determine the minimum size for
@@ -599,9 +583,7 @@
  of the image format.)</dd><dd><code>width</code> - width (in pixels) of the YUV image</dd><dd><code>pad</code> - the line padding used in the YUV image buffer.  For
  instance, if each line in each plane of the buffer is padded to the
  nearest multiple of 4 bytes, then <code>pad</code> should be set to 4.</dd><dd><code>height</code> - height (in pixels) of the YUV image</dd><dd><code>subsamp</code> - the level of chrominance subsampling used in the YUV
- image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ image (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>)</dd></dl>
 </li>
 </ul>
 <a name="getWidth()">
@@ -610,12 +592,9 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getWidth</h4>
-<pre>public&nbsp;int&nbsp;getWidth()
-             throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getWidth()</pre>
 <div class="block">Returns the width of the YUV image (or subregion.)</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the width of the YUV image (or subregion)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the width of the YUV image (or subregion)</dd></dl>
 </li>
 </ul>
 <a name="getHeight()">
@@ -624,12 +603,9 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getHeight</h4>
-<pre>public&nbsp;int&nbsp;getHeight()
-              throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getHeight()</pre>
 <div class="block">Returns the height of the YUV image (or subregion.)</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the height of the YUV image (or subregion)</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the height of the YUV image (or subregion)</dd></dl>
 </li>
 </ul>
 <a name="getPad()">
@@ -638,13 +614,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getPad</h4>
-<pre>public&nbsp;int&nbsp;getPad()
-           throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getPad()</pre>
 <div class="block">Returns the line padding used in the YUV image buffer (if this image is
  stored in a unified buffer rather than separate image planes.)</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the line padding used in the YUV image buffer</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the line padding used in the YUV image buffer</dd></dl>
 </li>
 </ul>
 <a name="getStrides()">
@@ -653,12 +626,9 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getStrides</h4>
-<pre>public&nbsp;int[]&nbsp;getStrides()
-                 throws java.lang.Exception</pre>
+<pre>public&nbsp;int[]&nbsp;getStrides()</pre>
 <div class="block">Returns the number of bytes per line of each plane in the YUV image.</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the number of bytes per line of each plane in the YUV image</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the number of bytes per line of each plane in the YUV image</dd></dl>
 </li>
 </ul>
 <a name="getOffsets()">
@@ -667,14 +637,11 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getOffsets</h4>
-<pre>public&nbsp;int[]&nbsp;getOffsets()
-                 throws java.lang.Exception</pre>
+<pre>public&nbsp;int[]&nbsp;getOffsets()</pre>
 <div class="block">Returns the offsets (in bytes) of each plane within the planes of a larger
  YUV image.</div>
 <dl><dt><span class="strong">Returns:</span></dt><dd>the offsets (in bytes) of each plane within the planes of a larger
- YUV image</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+ YUV image</dd></dl>
 </li>
 </ul>
 <a name="getSubsamp()">
@@ -683,13 +650,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getSubsamp</h4>
-<pre>public&nbsp;int&nbsp;getSubsamp()
-               throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getSubsamp()</pre>
 <div class="block">Returns the level of chrominance subsampling used in the YUV image.  See
  <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444"><code>TJ.SAMP_*</code></a>.</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the level of chrominance subsampling used in the YUV image</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the level of chrominance subsampling used in the YUV image</dd></dl>
 </li>
 </ul>
 <a name="getPlanes()">
@@ -698,13 +662,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getPlanes</h4>
-<pre>public&nbsp;byte[][]&nbsp;getPlanes()
-                   throws java.lang.Exception</pre>
+<pre>public&nbsp;byte[][]&nbsp;getPlanes()</pre>
 <div class="block">Returns the YUV image planes.  If the image is stored in a unified buffer,
  then all image planes will point to that buffer.</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the YUV image planes</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the YUV image planes</dd></dl>
 </li>
 </ul>
 <a name="getBuf()">
@@ -713,13 +674,10 @@
 <ul class="blockList">
 <li class="blockList">
 <h4>getBuf</h4>
-<pre>public&nbsp;byte[]&nbsp;getBuf()
-              throws java.lang.Exception</pre>
+<pre>public&nbsp;byte[]&nbsp;getBuf()</pre>
 <div class="block">Returns the YUV image buffer (if this image is stored in a unified
  buffer rather than separate image planes.)</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the YUV image buffer</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the YUV image buffer</dd></dl>
 </li>
 </ul>
 <a name="getSize()">
@@ -728,13 +686,10 @@
 <ul class="blockListLast">
 <li class="blockList">
 <h4>getSize</h4>
-<pre>public&nbsp;int&nbsp;getSize()
-            throws java.lang.Exception</pre>
+<pre>public&nbsp;int&nbsp;getSize()</pre>
 <div class="block">Returns the size (in bytes) of the YUV image buffer (if this image is
  stored in a unified buffer rather than separate image planes.)</div>
-<dl><dt><span class="strong">Returns:</span></dt><dd>the size (in bytes) of the YUV image buffer</dd>
-<dt><span class="strong">Throws:</span></dt>
-<dd><code>java.lang.Exception</code></dd></dl>
+<dl><dt><span class="strong">Returns:</span></dt><dd>the size (in bytes) of the YUV image buffer</dd></dl>
 </li>
 </ul>
 </li>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/package-frame.html b/java/doc/org/libjpegturbo/turbojpeg/package-frame.html
index 7cb8fa0..08a8bf8 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/package-frame.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/package-frame.html
@@ -22,6 +22,10 @@
 <li><a href="TJTransformer.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJTransformer</a></li>
 <li><a href="YUVImage.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">YUVImage</a></li>
 </ul>
+<h2 title="Exceptions">Exceptions</h2>
+<ul title="Exceptions">
+<li><a href="TJException.html" title="class in org.libjpegturbo.turbojpeg" target="classFrame">TJException</a></li>
+</ul>
 </div>
 </body>
 </html>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/package-summary.html b/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
index ea36057..f94656e 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
@@ -134,6 +134,21 @@
 </tbody>
 </table>
 </li>
+<li class="blockList">
+<table class="packageSummary" border="0" cellpadding="3" cellspacing="0" summary="Exception Summary table, listing exceptions, and an explanation">
+<caption><span>Exception Summary</span><span class="tabEnd">&nbsp;</span></caption>
+<tr>
+<th class="colFirst" scope="col">Exception</th>
+<th class="colLast" scope="col">Description</th>
+</tr>
+<tbody>
+<tr class="altColor">
+<td class="colFirst"><a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></td>
+<td class="colLast">&nbsp;</td>
+</tr>
+</tbody>
+</table>
+</li>
 </ul>
 </div>
 <!-- ======= START OF BOTTOM NAVBAR ====== -->
diff --git a/java/doc/org/libjpegturbo/turbojpeg/package-tree.html b/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
index 1033ee5..02a5cde 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
@@ -79,9 +79,22 @@
 </li>
 </ul>
 </li>
+<li type="circle">java.lang.Throwable (implements java.io.Serializable)
+<ul>
+<li type="circle">java.lang.Exception
+<ul>
+<li type="circle">java.io.IOException
+<ul>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJException</span></a></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</li>
 <li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJ</span></a></li>
-<li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJCompressor</span></a></li>
-<li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJDecompressor</span></a>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJCompressor</span></a> (implements java.io.Closeable)</li>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJDecompressor</span></a> (implements java.io.Closeable)
 <ul>
 <li type="circle">org.libjpegturbo.turbojpeg.<a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJTransformer</span></a></li>
 </ul>
diff --git a/java/doc/overview-tree.html b/java/doc/overview-tree.html
index eae18a1..2ae76c6 100644
--- a/java/doc/overview-tree.html
+++ b/java/doc/overview-tree.html
@@ -83,9 +83,22 @@
 </li>
 </ul>
 </li>
+<li type="circle">java.lang.Throwable (implements java.io.Serializable)
+<ul>
+<li type="circle">java.lang.Exception
+<ul>
+<li type="circle">java.io.IOException
+<ul>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJException</span></a></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</li>
 <li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJ</span></a></li>
-<li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJCompressor</span></a></li>
-<li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJDecompressor</span></a>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJCompressor</span></a> (implements java.io.Closeable)</li>
+<li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJDecompressor</span></a> (implements java.io.Closeable)
 <ul>
 <li type="circle">org.libjpegturbo.turbojpeg.<a href="org/libjpegturbo/turbojpeg/TJTransformer.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJTransformer</span></a></li>
 </ul>
diff --git a/java/doc/serialized-form.html b/java/doc/serialized-form.html
index bbe1805..846cabc 100644
--- a/java/doc/serialized-form.html
+++ b/java/doc/serialized-form.html
@@ -66,6 +66,15 @@
 <li class="blockList">
 <h2 title="Package">Package&nbsp;org.libjpegturbo.turbojpeg</h2>
 <ul class="blockList">
+<li class="blockList"><a name="org.libjpegturbo.turbojpeg.TJException">
+<!--   -->
+</a>
+<h3>Class <a href="org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">org.libjpegturbo.turbojpeg.TJException</a> extends java.io.IOException implements Serializable</h3>
+<dl class="nameValue">
+<dt>serialVersionUID:</dt>
+<dd>1L</dd>
+</dl>
+</li>
 <li class="blockList"><a name="org.libjpegturbo.turbojpeg.TJTransform">
 <!--   -->
 </a>
diff --git a/java/org/libjpegturbo/turbojpeg/TJ.java b/java/org/libjpegturbo/turbojpeg/TJ.java
index 644a197..02d14c0 100644
--- a/java/org/libjpegturbo/turbojpeg/TJ.java
+++ b/java/org/libjpegturbo/turbojpeg/TJ.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2011-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -87,9 +88,8 @@
    * @return the MCU block width for the given level of chrominance
    * subsampling.
    */
-  public static int getMCUWidth(int subsamp) throws Exception {
-    if (subsamp < 0 || subsamp >= NUMSAMP)
-      throw new Exception("Invalid subsampling type");
+  public static int getMCUWidth(int subsamp) {
+    checkSubsampling(subsamp);
     return mcuWidth[subsamp];
   }
 
@@ -108,9 +108,8 @@
    * @return the MCU block height for the given level of chrominance
    * subsampling.
    */
-  public static int getMCUHeight(int subsamp) throws Exception {
-    if (subsamp < 0 || subsamp >= NUMSAMP)
-      throw new Exception("Invalid subsampling type");
+  public static int getMCUHeight(int subsamp) {
+    checkSubsampling(subsamp);
     return mcuHeight[subsamp];
   }
 
@@ -217,9 +216,8 @@
    *
    * @return the pixel size (in bytes) for the given pixel format.
    */
-  public static int getPixelSize(int pixelFormat) throws Exception {
-    if (pixelFormat < 0 || pixelFormat >= NUMPF)
-      throw new Exception("Invalid pixel format");
+  public static int getPixelSize(int pixelFormat) {
+    checkPixelFormat(pixelFormat);
     return pixelSize[pixelFormat];
   }
 
@@ -239,9 +237,8 @@
    *
    * @return the red offset for the given pixel format.
    */
-  public static int getRedOffset(int pixelFormat) throws Exception {
-    if (pixelFormat < 0 || pixelFormat >= NUMPF)
-      throw new Exception("Invalid pixel format");
+  public static int getRedOffset(int pixelFormat) {
+    checkPixelFormat(pixelFormat);
     return redOffset[pixelFormat];
   }
 
@@ -261,9 +258,8 @@
    *
    * @return the green offset for the given pixel format.
    */
-  public static int getGreenOffset(int pixelFormat) throws Exception {
-    if (pixelFormat < 0 || pixelFormat >= NUMPF)
-      throw new Exception("Invalid pixel format");
+  public static int getGreenOffset(int pixelFormat) {
+    checkPixelFormat(pixelFormat);
     return greenOffset[pixelFormat];
   }
 
@@ -283,9 +279,8 @@
    *
    * @return the blue offset for the given pixel format.
    */
-  public static int getBlueOffset(int pixelFormat) throws Exception {
-    if (pixelFormat < 0 || pixelFormat >= NUMPF)
-      throw new Exception("Invalid pixel format");
+  public static int getBlueOffset(int pixelFormat) {
+    checkPixelFormat(pixelFormat);
     return blueOffset[pixelFormat];
   }
 
@@ -407,8 +402,7 @@
    * @return the maximum size of the buffer (in bytes) required to hold a JPEG
    * image with the given width, height, and level of chrominance subsampling.
    */
-  public static native int bufSize(int width, int height, int jpegSubsamp)
-    throws Exception;
+  public static native int bufSize(int width, int height, int jpegSubsamp);
 
   /**
    * Returns the size of the buffer (in bytes) required to hold a YUV planar
@@ -428,15 +422,13 @@
    * image with the given width, height, and level of chrominance subsampling.
    */
   public static native int bufSizeYUV(int width, int pad, int height,
-                                      int subsamp)
-    throws Exception;
+                                      int subsamp);
 
   /**
    * @deprecated Use {@link #bufSizeYUV(int, int, int, int)} instead.
    */
   @Deprecated
-  public static native int bufSizeYUV(int width, int height, int subsamp)
-    throws Exception;
+  public static native int bufSizeYUV(int width, int height, int subsamp);
 
   /**
    * Returns the size of the buffer (in bytes) required to hold a YUV image
@@ -460,8 +452,7 @@
    * image with the given parameters.
    */
   public static native int planeSizeYUV(int componentID, int width, int stride,
-                                        int height, int subsamp)
-    throws Exception;
+                                        int height, int subsamp);
 
   /**
    * Returns the plane width of a YUV image plane with the given parameters.
@@ -477,8 +468,7 @@
    *
    * @return the plane width of a YUV image plane with the given parameters.
    */
-  public static native int planeWidth(int componentID, int width, int subsamp)
-    throws Exception;
+  public static native int planeWidth(int componentID, int width, int subsamp);
 
   /**
    * Returns the plane height of a YUV image plane with the given parameters.
@@ -495,8 +485,7 @@
    * @return the plane height of a YUV image plane with the given parameters.
    */
   public static native int planeHeight(int componentID, int height,
-                                       int subsamp)
-    throws Exception;
+                                       int subsamp);
 
   /**
    * Returns a list of fractional scaling factors that the JPEG decompressor in
@@ -505,10 +494,20 @@
    * @return a list of fractional scaling factors that the JPEG decompressor in
    * this implementation of TurboJPEG supports.
    */
-  public static native TJScalingFactor[] getScalingFactors()
-    throws Exception;
+  public static native TJScalingFactor[] getScalingFactors();
 
   static {
     TJLoader.load();
   }
-};
+
+  private static void checkPixelFormat(int pixelFormat) {
+    if (pixelFormat < 0 || pixelFormat >= NUMPF)
+      throw new IllegalArgumentException("Invalid pixel format");
+  }
+
+  private static void checkSubsampling(int subsamp) {
+    if (subsamp < 0 || subsamp >= NUMSAMP)
+      throw new IllegalArgumentException("Invalid subsampling type");
+  }
+
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
index 6ec581a..2ff8e4d 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2011-2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,11 +31,12 @@
 
 import java.awt.image.*;
 import java.nio.*;
+import java.io.*;
 
 /**
  * TurboJPEG compressor
  */
-public class TJCompressor {
+public class TJCompressor implements Closeable {
 
   private static final String NO_ASSOC_ERROR =
     "No source image is associated with this instance";
@@ -42,7 +44,7 @@
   /**
    * Create a TurboJPEG compressor instance.
    */
-  public TJCompressor() throws Exception {
+  public TJCompressor() throws TJException {
     init();
   }
 
@@ -67,7 +69,7 @@
    * {@link TJ#PF_RGB TJ.PF_*})
    */
   public TJCompressor(byte[] srcImage, int x, int y, int width, int pitch,
-                      int height, int pixelFormat) throws Exception {
+                      int height, int pixelFormat) throws TJException {
     setSourceImage(srcImage, x, y, width, pitch, height, pixelFormat);
   }
 
@@ -77,7 +79,7 @@
    */
   @Deprecated
   public TJCompressor(byte[] srcImage, int width, int pitch, int height,
-                      int pixelFormat) throws Exception {
+                      int pixelFormat) throws TJException {
     setSourceImage(srcImage, width, pitch, height, pixelFormat);
   }
 
@@ -102,7 +104,7 @@
    * {@link #setSourceImage(BufferedImage, int, int, int, int)} for description
    */
   public TJCompressor(BufferedImage srcImage, int x, int y, int width,
-                      int height) throws Exception {
+                      int height) throws TJException {
     setSourceImage(srcImage, x, y, width, height);
   }
 
@@ -139,11 +141,11 @@
    */
   public void setSourceImage(byte[] srcImage, int x, int y, int width,
                              int pitch, int height, int pixelFormat)
-                             throws Exception {
+                             throws TJException {
     if (handle == 0) init();
     if (srcImage == null || x < 0 || y < 0 || width < 1 || height < 1 ||
         pitch < 0 || pixelFormat < 0 || pixelFormat >= TJ.NUMPF)
-      throw new Exception("Invalid argument in setSourceImage()");
+      throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     srcBuf = srcImage;
     srcWidth = width;
     if (pitch == 0)
@@ -164,7 +166,7 @@
    */
   @Deprecated
   public void setSourceImage(byte[] srcImage, int width, int pitch,
-                             int height, int pixelFormat) throws Exception {
+                             int height, int pixelFormat) throws TJException {
     setSourceImage(srcImage, 0, 0, width, pitch, height, pixelFormat);
     srcX = srcY = -1;
   }
@@ -191,16 +193,16 @@
    * height of the source image)
    */
   public void setSourceImage(BufferedImage srcImage, int x, int y, int width,
-                             int height) throws Exception {
+                             int height) throws TJException {
     if (handle == 0) init();
     if (srcImage == null || x < 0 || y < 0 || width < 0 || height < 0)
-      throw new Exception("Invalid argument in setSourceImage()");
+      throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     srcX = x;
     srcY = y;
     srcWidth = (width == 0) ? srcImage.getWidth(): width;
     srcHeight = (height == 0) ? srcImage.getHeight() : height;
     if (x + width > srcImage.getWidth() || y + height > srcImage.getHeight())
-      throw new Exception("Compression region exceeds the bounds of the source image");
+      throw new IllegalArgumentException("Compression region exceeds the bounds of the source image");
 
     int pixelFormat;
     boolean intPixels = false;
@@ -229,7 +231,7 @@
           pixelFormat = TJ.PF_BGRX;
         intPixels = true;  break;
       default:
-        throw new Exception("Unsupported BufferedImage format");
+        throw new IllegalArgumentException("Unsupported BufferedImage format");
     }
     srcPixelFormat = pixelFormat;
 
@@ -246,7 +248,7 @@
         (ComponentSampleModel)srcImage.getSampleModel();
       int pixelSize = sm.getPixelStride();
       if (pixelSize != TJ.getPixelSize(pixelFormat))
-        throw new Exception("Inconsistency between pixel format and pixel size in BufferedImage");
+        throw new IllegalArgumentException("Inconsistency between pixel format and pixel size in BufferedImage");
       srcPitch = sm.getScanlineStride();
       DataBufferByte db = (DataBufferByte)wr.getDataBuffer();
       srcBuf = db.getData();
@@ -262,10 +264,10 @@
    * @param srcImage YUV planar image to be compressed.  This image is not
    * modified.
    */
-  public void setSourceImage(YUVImage srcImage) throws Exception {
+  public void setSourceImage(YUVImage srcImage) throws TJException {
     if (handle == 0) init();
     if (srcImage == null)
-      throw new Exception("Invalid argument in setSourceImage()");
+      throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     srcYUVImage = srcImage;
     srcBuf = null;
     srcBufInt = null;
@@ -292,9 +294,9 @@
    * subsequent compress/encode oeprations (one of
    * {@link TJ#SAMP_444 TJ.SAMP_*})
    */
-  public void setSubsamp(int newSubsamp) throws Exception {
+  public void setSubsamp(int newSubsamp) {
     if (newSubsamp < 0 || newSubsamp >= TJ.NUMSAMP)
-      throw new Exception("Invalid argument in setSubsamp()");
+      throw new IllegalArgumentException("Invalid argument in setSubsamp()");
     subsamp = newSubsamp;
   }
 
@@ -304,9 +306,9 @@
    * @param quality the new JPEG image quality level (1 to 100, 1 = worst,
    * 100 = best)
    */
-  public void setJPEGQuality(int quality) throws Exception {
+  public void setJPEGQuality(int quality) {
     if (quality < 1 || quality > 100)
-      throw new Exception("Invalid argument in setJPEGQuality()");
+      throw new IllegalArgumentException("Invalid argument in setJPEGQuality()");
     jpegQuality = quality;
   }
 
@@ -322,15 +324,15 @@
    * @param flags the bitwise OR of one or more of
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
-  public void compress(byte[] dstBuf, int flags) throws Exception {
+  public void compress(byte[] dstBuf, int flags) throws TJException {
     if (dstBuf == null || flags < 0)
-      throw new Exception("Invalid argument in compress()");
+      throw new IllegalArgumentException("Invalid argument in compress()");
     if (srcBuf == null && srcBufInt == null && srcYUVImage == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (jpegQuality < 0)
-      throw new Exception("JPEG Quality not set");
+      throw new IllegalStateException("JPEG Quality not set");
     if (subsamp < 0 && srcYUVImage == null)
-      throw new Exception("Subsampling level not set");
+      throw new IllegalStateException("Subsampling level not set");
 
     if (srcYUVImage != null)
       compressedSize = compressFromYUV(srcYUVImage.getPlanes(),
@@ -372,9 +374,8 @@
    * not be equal to the size of the JPEG image.  Use {@link
    * #getCompressedSize} to obtain the size of the JPEG image.
    */
-  public byte[] compress(int flags) throws Exception {
-    if (srcWidth < 1 || srcHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+  public byte[] compress(int flags) throws TJException {
+    checkSourceImage();
     byte[] buf = new byte[TJ.bufSize(srcWidth, srcHeight, subsamp)];
     compress(buf, flags);
     return buf;
@@ -387,7 +388,7 @@
    */
   @Deprecated
   public void compress(BufferedImage srcImage, byte[] dstBuf, int flags)
-                       throws Exception {
+                       throws TJException {
     setSourceImage(srcImage, 0, 0, 0, 0);
     compress(dstBuf, flags);
   }
@@ -398,7 +399,8 @@
    * {@link #compress(int)} instead.
    */
   @Deprecated
-  public byte[] compress(BufferedImage srcImage, int flags) throws Exception {
+  public byte[] compress(BufferedImage srcImage, int flags)
+                         throws TJException {
     setSourceImage(srcImage, 0, 0, 0, 0);
     return compress(flags);
   }
@@ -417,17 +419,16 @@
    * @param flags the bitwise OR of one or more of
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
-  public void encodeYUV(YUVImage dstImage, int flags) throws Exception {
+  public void encodeYUV(YUVImage dstImage, int flags) throws TJException {
     if (dstImage == null || flags < 0)
-      throw new Exception("Invalid argument in encodeYUV()");
+      throw new IllegalArgumentException("Invalid argument in encodeYUV()");
     if (srcBuf == null && srcBufInt == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (srcYUVImage != null)
-      throw new Exception("Source image is not correct type");
-    if (subsamp < 0)
-      throw new Exception("Subsampling level not set");
+      throw new IllegalStateException("Source image is not correct type");
+    checkSubsampling();
     if (srcWidth != dstImage.getWidth() || srcHeight != dstImage.getHeight())
-      throw new Exception("Destination image is the wrong size");
+      throw new IllegalStateException("Destination image is the wrong size");
 
     if (srcBufInt != null) {
       encodeYUV(srcBufInt, srcX, srcY, srcWidth, srcStride, srcHeight,
@@ -445,13 +446,11 @@
    * @deprecated Use {@link #encodeYUV(YUVImage, int)} instead.
    */
   @Deprecated
-  public void encodeYUV(byte[] dstBuf, int flags) throws Exception {
+  public void encodeYUV(byte[] dstBuf, int flags) throws TJException {
     if(dstBuf == null)
-      throw new Exception("Invalid argument in encodeYUV()");
-    if (srcWidth < 1 || srcHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
-    if (subsamp < 0)
-      throw new Exception("Subsampling level not set");
+      throw new IllegalArgumentException("Invalid argument in encodeYUV()");
+    checkSourceImage();
+    checkSubsampling();
     YUVImage yuvImage = new YUVImage(dstBuf, srcWidth, 4, srcHeight, subsamp);
     encodeYUV(yuvImage, flags);
   }
@@ -473,13 +472,11 @@
    *
    * @return a YUV planar image.
    */
-  public YUVImage encodeYUV(int pad, int flags) throws Exception {
-    if (srcWidth < 1 || srcHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
-    if (subsamp < 0)
-      throw new Exception("Subsampling level not set");
+  public YUVImage encodeYUV(int pad, int flags) throws TJException {
+    checkSourceImage();
+    checkSubsampling();
     if(pad < 1 || ((pad & (pad - 1)) != 0))
-      throw new Exception("Invalid argument in encodeYUV()");
+      throw new IllegalArgumentException("Invalid argument in encodeYUV()");
     YUVImage yuvImage = new YUVImage(srcWidth, pad, srcHeight, subsamp);
     encodeYUV(yuvImage, flags);
     return yuvImage;
@@ -506,11 +503,9 @@
    *
    * @return a YUV planar image.
    */
-  public YUVImage encodeYUV(int[] strides, int flags) throws Exception {
-    if (srcWidth < 1 || srcHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
-    if (subsamp < 0)
-      throw new Exception("Subsampling level not set");
+  public YUVImage encodeYUV(int[] strides, int flags) throws TJException {
+    checkSourceImage();
+    checkSubsampling();
     YUVImage yuvImage = new YUVImage(srcWidth, strides, srcHeight, subsamp);
     encodeYUV(yuvImage, flags);
     return yuvImage;
@@ -520,11 +515,9 @@
    * @deprecated Use {@link #encodeYUV(int, int)} instead.
    */
   @Deprecated
-  public byte[] encodeYUV(int flags) throws Exception {
-    if (srcWidth < 1 || srcHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
-    if (subsamp < 0)
-      throw new Exception("Subsampling level not set");
+  public byte[] encodeYUV(int flags) throws TJException {
+    checkSourceImage();
+    checkSubsampling();
     YUVImage yuvImage = new YUVImage(srcWidth, 4, srcHeight, subsamp);
     encodeYUV(yuvImage, flags);
     return yuvImage.getBuf();
@@ -537,7 +530,7 @@
    */
   @Deprecated
   public void encodeYUV(BufferedImage srcImage, byte[] dstBuf, int flags)
-    throws Exception {
+                        throws TJException {
     setSourceImage(srcImage, 0, 0, 0, 0);
     encodeYUV(dstBuf, flags);
   }
@@ -548,7 +541,8 @@
    * {@link #encodeYUV(int, int)} instead.
    */
   @Deprecated
-  public byte[] encodeYUV(BufferedImage srcImage, int flags) throws Exception {
+  public byte[] encodeYUV(BufferedImage srcImage, int flags)
+                          throws TJException {
     setSourceImage(srcImage, 0, 0, 0, 0);
     return encodeYUV(flags);
   }
@@ -567,68 +561,84 @@
   /**
    * Free the native structures associated with this compressor instance.
    */
-  public void close() throws Exception {
+  @Override
+  public void close() throws TJException {
     if (handle != 0)
       destroy();
   }
 
+  @Override
   protected void finalize() throws Throwable {
     try {
       close();
-    } catch(Exception e) {
+    } catch(TJException e) {
     } finally {
       super.finalize();
     }
   };
 
-  private native void init() throws Exception;
+  private native void init() throws TJException;
 
-  private native void destroy() throws Exception;
+  private native void destroy() throws TJException;
 
   // JPEG size in bytes is returned
+  @Deprecated
   private native int compress(byte[] srcBuf, int width, int pitch,
     int height, int pixelFormat, byte[] dstBuf, int jpegSubsamp, int jpegQual,
-    int flags) throws Exception; // deprecated
+    int flags) throws TJException;
 
   private native int compress(byte[] srcBuf, int x, int y, int width,
     int pitch, int height, int pixelFormat, byte[] dstBuf, int jpegSubsamp,
-    int jpegQual, int flags) throws Exception;
+    int jpegQual, int flags) throws TJException;
 
+  @Deprecated
   private native int compress(int[] srcBuf, int width, int stride,
     int height, int pixelFormat, byte[] dstBuf, int jpegSubsamp, int jpegQual,
-    int flags) throws Exception; // deprecated
+    int flags) throws TJException;
 
   private native int compress(int[] srcBuf, int x, int y, int width,
     int stride, int height, int pixelFormat, byte[] dstBuf, int jpegSubsamp,
-    int jpegQual, int flags) throws Exception;
+    int jpegQual, int flags) throws TJException;
 
   private native int compressFromYUV(byte[][] srcPlanes, int[] srcOffsets,
     int width, int[] srcStrides, int height, int subsamp, byte[] dstBuf,
     int jpegQual, int flags)
-    throws Exception;
+    throws TJException;
 
+  @Deprecated
   private native void encodeYUV(byte[] srcBuf, int width, int pitch,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception; // deprecated
+    throws TJException;
 
   private native void encodeYUV(byte[] srcBuf, int x, int y, int width,
     int pitch, int height, int pixelFormat, byte[][] dstPlanes,
     int[] dstOffsets, int[] dstStrides, int subsamp, int flags)
-    throws Exception;
+    throws TJException;
 
+  @Deprecated
   private native void encodeYUV(int[] srcBuf, int width, int stride,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception; // deprecated
+    throws TJException;
 
   private native void encodeYUV(int[] srcBuf, int x, int y, int width,
     int srcStride, int height, int pixelFormat, byte[][] dstPlanes,
     int[] dstOffsets, int[] dstStrides, int subsamp, int flags)
-    throws Exception;
+    throws TJException;
 
   static {
     TJLoader.load();
   }
 
+  private void checkSourceImage() {
+    if (srcWidth < 1 || srcHeight < 1)
+      throw new IllegalStateException(NO_ASSOC_ERROR);
+  }
+
+  private void checkSubsampling() {
+    if (subsamp < 0)
+      throw new IllegalStateException("Subsampling level not set");
+  }
+
   private long handle = 0;
   private byte[] srcBuf = null;
   private int[] srcBufInt = null;
@@ -645,4 +655,4 @@
   private int compressedSize = 0;
   private int yuvPad = 4;
   private ByteOrder byteOrder = null;
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
index bf78f2e..9a34587 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
@@ -72,5 +72,5 @@
   void customFilter(ShortBuffer coeffBuffer, Rectangle bufferRegion,
                     Rectangle planeRegion, int componentID, int transformID,
                     TJTransform transform)
-    throws Exception;
+    throws TJException;
 }
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
index 7ec557f..bd0e694 100644
--- a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2011-2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,11 +31,12 @@
 
 import java.awt.image.*;
 import java.nio.*;
+import java.io.*;
 
 /**
  * TurboJPEG decompressor
  */
-public class TJDecompressor {
+public class TJDecompressor implements Closeable {
 
   private static final String NO_ASSOC_ERROR =
     "No JPEG image is associated with this instance";
@@ -42,7 +44,7 @@
   /**
    * Create a TurboJPEG decompresssor instance.
    */
-  public TJDecompressor() throws Exception {
+  public TJDecompressor() throws TJException {
     init();
   }
 
@@ -53,7 +55,7 @@
    * @param jpegImage JPEG image buffer (size of the JPEG image is assumed to
    * be the length of the array.)  This buffer is not modified.
    */
-  public TJDecompressor(byte[] jpegImage) throws Exception {
+  public TJDecompressor(byte[] jpegImage) throws TJException {
     init();
     setSourceImage(jpegImage, jpegImage.length);
   }
@@ -67,7 +69,7 @@
    *
    * @param imageSize size of the JPEG image (in bytes)
    */
-  public TJDecompressor(byte[] jpegImage, int imageSize) throws Exception {
+  public TJDecompressor(byte[] jpegImage, int imageSize) throws TJException {
     init();
     setSourceImage(jpegImage, imageSize);
   }
@@ -80,7 +82,7 @@
    * @param yuvImage {@link YUVImage} instance containing a YUV planar
    * image to be decoded.  This image is not modified.
    */
-  public TJDecompressor(YUVImage yuvImage) throws Exception {
+  public TJDecompressor(YUVImage yuvImage) throws TJException {
     init();
     setSourceImage(yuvImage);
   }
@@ -95,9 +97,9 @@
    * @param imageSize size of the JPEG image (in bytes)
    */
   public void setSourceImage(byte[] jpegImage, int imageSize)
-    throws Exception {
+                             throws TJException {
     if (jpegImage == null || imageSize < 1)
-      throw new Exception("Invalid argument in setSourceImage()");
+      throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     jpegBuf = jpegImage;
     jpegBufSize = imageSize;
     decompressHeader(jpegBuf, jpegBufSize);
@@ -108,7 +110,8 @@
    * @deprecated Use {@link #setSourceImage(byte[], int)} instead.
    */
   @Deprecated
-  public void setJPEGImage(byte[] jpegImage, int imageSize) throws Exception {
+  public void setJPEGImage(byte[] jpegImage, int imageSize)
+                           throws TJException {
     setSourceImage(jpegImage, imageSize);
   }
 
@@ -120,9 +123,9 @@
    * @param srcImage {@link YUVImage} instance containing a YUV planar image to
    * be decoded.  This image is not modified.
    */
-  public void setSourceImage(YUVImage srcImage) throws Exception {
+  public void setSourceImage(YUVImage srcImage) {
     if (srcImage == null)
-      throw new Exception("Invalid argument in setSourceImage()");
+      throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     yuvImage = srcImage;
     jpegBuf = null;
     jpegBufSize = 0;
@@ -136,11 +139,11 @@
    * @return the width of the source image (JPEG or YUV) associated with this
    * decompressor instance.
    */
-  public int getWidth() throws Exception {
+  public int getWidth() {
     if (yuvImage != null)
       return yuvImage.getWidth();
     if (jpegWidth < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return jpegWidth;
   }
 
@@ -151,11 +154,11 @@
    * @return the height of the source image (JPEG or YUV) associated with this
    * decompressor instance.
    */
-  public int getHeight() throws Exception {
+  public int getHeight() {
     if (yuvImage != null)
       return yuvImage.getHeight();
     if (jpegHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return jpegHeight;
   }
 
@@ -167,13 +170,13 @@
    * @return the level of chrominance subsampling used in the source image
    * (JPEG or YUV) associated with this decompressor instance.
    */
-  public int getSubsamp() throws Exception {
+  public int getSubsamp() {
     if (yuvImage != null)
       return yuvImage.getSubsamp();
     if (jpegSubsamp < 0)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (jpegSubsamp >= TJ.NUMSAMP)
-      throw new Exception("JPEG header information is invalid");
+      throw new IllegalStateException("JPEG header information is invalid");
     return jpegSubsamp;
   }
 
@@ -185,13 +188,13 @@
    * @return the colorspace used in the source image (JPEG or YUV) associated
    * with this decompressor instance.
    */
-  public int getColorspace() throws Exception {
+  public int getColorspace() {
     if (yuvImage != null)
       return TJ.CS_YCbCr;
     if (jpegColorspace < 0)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (jpegColorspace >= TJ.NUMCS)
-      throw new Exception("JPEG header information is invalid");
+      throw new IllegalStateException("JPEG header information is invalid");
     return jpegColorspace;
   }
 
@@ -200,9 +203,9 @@
    *
    * @return the JPEG image buffer associated with this decompressor instance.
    */
-  public byte[] getJPEGBuf() throws Exception {
+  public byte[] getJPEGBuf() {
     if (jpegBuf == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return jpegBuf;
   }
 
@@ -213,9 +216,9 @@
    * @return the size of the JPEG image (in bytes) associated with this
    * decompressor instance.
    */
-  public int getJPEGSize() throws Exception {
+  public int getJPEGSize() {
     if (jpegBufSize < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return jpegBufSize;
   }
 
@@ -238,12 +241,11 @@
    * decompressor can generate without exceeding the desired image width and
    * height.
    */
-  public int getScaledWidth(int desiredWidth, int desiredHeight)
-                            throws Exception {
+  public int getScaledWidth(int desiredWidth, int desiredHeight) {
     if (jpegWidth < 1 || jpegHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (desiredWidth < 0 || desiredHeight < 0)
-      throw new Exception("Invalid argument in getScaledWidth()");
+      throw new IllegalArgumentException("Invalid argument in getScaledWidth()");
     TJScalingFactor[] sf = TJ.getScalingFactors();
     if (desiredWidth == 0)
       desiredWidth = jpegWidth;
@@ -257,7 +259,7 @@
         break;
     }
     if (scaledWidth > desiredWidth || scaledHeight > desiredHeight)
-      throw new Exception("Could not scale down to desired image dimensions");
+      throw new IllegalArgumentException("Could not scale down to desired image dimensions");
     return scaledWidth;
   }
 
@@ -280,12 +282,11 @@
    * decompressor can generate without exceeding the desired image width and
    * height.
    */
-  public int getScaledHeight(int desiredWidth, int desiredHeight)
-                             throws Exception {
+  public int getScaledHeight(int desiredWidth, int desiredHeight) {
     if (jpegWidth < 1 || jpegHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (desiredWidth < 0 || desiredHeight < 0)
-      throw new Exception("Invalid argument in getScaledHeight()");
+      throw new IllegalArgumentException("Invalid argument in getScaledHeight()");
     TJScalingFactor[] sf = TJ.getScalingFactors();
     if (desiredWidth == 0)
       desiredWidth = jpegWidth;
@@ -299,7 +300,7 @@
         break;
     }
     if (scaledWidth > desiredWidth || scaledHeight > desiredHeight)
-      throw new Exception("Could not scale down to desired image dimensions");
+      throw new IllegalArgumentException("Could not scale down to desired image dimensions");
     return scaledHeight;
   }
 
@@ -369,13 +370,13 @@
    */
   public void decompress(byte[] dstBuf, int x, int y, int desiredWidth,
                          int pitch, int desiredHeight, int pixelFormat,
-                         int flags) throws Exception {
+                         int flags) throws TJException {
     if (jpegBuf == null && yuvImage == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (dstBuf == null || x < 0 || y < 0 || pitch < 0 ||
         (yuvImage != null && (desiredWidth < 0 || desiredHeight < 0)) ||
         pixelFormat < 0 || pixelFormat >= TJ.NUMPF || flags < 0)
-      throw new Exception("Invalid argument in decompress()");
+      throw new IllegalArgumentException("Invalid argument in decompress()");
     if (yuvImage != null)
       decodeYUV(yuvImage.getPlanes(), yuvImage.getOffsets(),
                 yuvImage.getStrides(), yuvImage.getSubsamp(), dstBuf, x, y,
@@ -398,7 +399,7 @@
   @Deprecated
   public void decompress(byte[] dstBuf, int desiredWidth, int pitch,
                          int desiredHeight, int pixelFormat, int flags)
-                         throws Exception {
+                         throws TJException {
     decompress(dstBuf, 0, 0, desiredWidth, pitch, desiredHeight, pixelFormat,
                flags);
   }
@@ -428,11 +429,11 @@
    * @return a buffer containing the decompressed image.
    */
   public byte[] decompress(int desiredWidth, int pitch, int desiredHeight,
-                           int pixelFormat, int flags) throws Exception {
+                           int pixelFormat, int flags) throws TJException {
     if (pitch < 0 ||
         (yuvImage == null && (desiredWidth < 0 || desiredHeight < 0)) ||
         pixelFormat < 0 || pixelFormat >= TJ.NUMPF || flags < 0)
-      throw new Exception("Invalid argument in decompress()");
+      throw new IllegalArgumentException("Invalid argument in decompress()");
     int pixelSize = TJ.getPixelSize(pixelFormat);
     int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
     int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
@@ -461,20 +462,21 @@
    * @param flags the bitwise OR of one or more of
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
-  public void decompressToYUV(YUVImage dstImage, int flags) throws Exception {
+  public void decompressToYUV(YUVImage dstImage, int flags)
+                              throws TJException {
     if (jpegBuf == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (dstImage == null || flags < 0)
-      throw new Exception("Invalid argument in decompressToYUV()");
+      throw new IllegalArgumentException("Invalid argument in decompressToYUV()");
     int scaledWidth = getScaledWidth(dstImage.getWidth(),
                                      dstImage.getHeight());
     int scaledHeight = getScaledHeight(dstImage.getWidth(),
                                        dstImage.getHeight());
     if (scaledWidth != dstImage.getWidth() ||
         scaledHeight != dstImage.getHeight())
-      throw new Exception("YUVImage dimensions do not match one of the scaled image sizes that TurboJPEG is capable of generating.");
+      throw new IllegalArgumentException("YUVImage dimensions do not match one of the scaled image sizes that TurboJPEG is capable of generating.");
     if (jpegSubsamp != dstImage.getSubsamp())
-      throw new Exception("YUVImage subsampling level does not match that of the JPEG image");
+      throw new IllegalArgumentException("YUVImage subsampling level does not match that of the JPEG image");
 
     decompressToYUV(jpegBuf, jpegBufSize, dstImage.getPlanes(),
                     dstImage.getOffsets(), dstImage.getWidth(),
@@ -485,7 +487,7 @@
    * @deprecated Use {@link #decompressToYUV(YUVImage, int)} instead.
    */
   @Deprecated
-  public void decompressToYUV(byte[] dstBuf, int flags) throws Exception {
+  public void decompressToYUV(byte[] dstBuf, int flags) throws TJException {
     YUVImage dstImage = new YUVImage(dstBuf, jpegWidth, 4, jpegHeight,
                                      jpegSubsamp);
     decompressToYUV(dstImage, flags);
@@ -531,15 +533,15 @@
    */
   public YUVImage decompressToYUV(int desiredWidth, int[] strides,
                                   int desiredHeight,
-                                  int flags) throws Exception {
+                                  int flags) throws TJException {
     if (flags < 0)
-      throw new Exception("Invalid argument in decompressToYUV()");
+      throw new IllegalArgumentException("Invalid argument in decompressToYUV()");
     if (jpegWidth < 1 || jpegHeight < 1 || jpegSubsamp < 0)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (jpegSubsamp >= TJ.NUMSAMP)
-      throw new Exception("JPEG header information is invalid");
+      throw new IllegalStateException("JPEG header information is invalid");
     if (yuvImage != null)
-      throw new Exception("Source image is the wrong type");
+      throw new IllegalStateException("Source image is the wrong type");
 
     int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
     int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
@@ -584,15 +586,15 @@
    * @return a YUV planar image.
    */
   public YUVImage decompressToYUV(int desiredWidth, int pad, int desiredHeight,
-                                  int flags) throws Exception {
+                                  int flags) throws TJException {
     if (flags < 0)
-      throw new Exception("Invalid argument in decompressToYUV()");
+      throw new IllegalArgumentException("Invalid argument in decompressToYUV()");
     if (jpegWidth < 1 || jpegHeight < 1 || jpegSubsamp < 0)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (jpegSubsamp >= TJ.NUMSAMP)
-      throw new Exception("JPEG header information is invalid");
+      throw new IllegalStateException("JPEG header information is invalid");
     if (yuvImage != null)
-      throw new Exception("Source image is the wrong type");
+      throw new IllegalStateException("Source image is the wrong type");
 
     int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
     int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
@@ -606,7 +608,7 @@
    * @deprecated Use {@link #decompressToYUV(int, int, int, int)} instead.
    */
   @Deprecated
-  public byte[] decompressToYUV(int flags) throws Exception {
+  public byte[] decompressToYUV(int flags) throws TJException {
     YUVImage dstImage = new YUVImage(jpegWidth, 4, jpegHeight, jpegSubsamp);
     decompressToYUV(dstImage, flags);
     return dstImage.getBuf();
@@ -676,13 +678,13 @@
    */
   public void decompress(int[] dstBuf, int x, int y, int desiredWidth,
                          int stride, int desiredHeight, int pixelFormat,
-                         int flags) throws Exception {
+                         int flags) throws TJException {
     if (jpegBuf == null && yuvImage == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (dstBuf == null || x < 0 || y < 0 || stride < 0 ||
         (yuvImage != null && (desiredWidth < 0 || desiredHeight < 0)) ||
         pixelFormat < 0 || pixelFormat >= TJ.NUMPF || flags < 0)
-      throw new Exception("Invalid argument in decompress()");
+      throw new IllegalArgumentException("Invalid argument in decompress()");
     if (yuvImage != null)
       decodeYUV(yuvImage.getPlanes(), yuvImage.getOffsets(),
                 yuvImage.getStrides(), yuvImage.getSubsamp(), dstBuf, x, y,
@@ -709,9 +711,10 @@
    * @param flags the bitwise OR of one or more of
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
-  public void decompress(BufferedImage dstImage, int flags) throws Exception {
+  public void decompress(BufferedImage dstImage, int flags)
+                         throws TJException {
     if (dstImage == null || flags < 0)
-      throw new Exception("Invalid argument in decompress()");
+      throw new IllegalArgumentException("Invalid argument in decompress()");
     int desiredWidth = dstImage.getWidth();
     int desiredHeight = dstImage.getHeight();
     int scaledWidth, scaledHeight;
@@ -719,14 +722,14 @@
     if (yuvImage != null) {
       if (desiredWidth != yuvImage.getWidth() ||
           desiredHeight != yuvImage.getHeight())
-        throw new Exception("BufferedImage dimensions do not match the dimensions of the source image.");
+        throw new IllegalArgumentException("BufferedImage dimensions do not match the dimensions of the source image.");
       scaledWidth = yuvImage.getWidth();
       scaledHeight = yuvImage.getHeight();
     } else {
       scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
       scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
       if (scaledWidth != desiredWidth || scaledHeight != desiredHeight)
-        throw new Exception("BufferedImage dimensions do not match one of the scaled image sizes that TurboJPEG is capable of generating.");
+        throw new IllegalArgumentException("BufferedImage dimensions do not match one of the scaled image sizes that TurboJPEG is capable of generating.");
     }
     int pixelFormat;  boolean intPixels = false;
     if (byteOrder == null)
@@ -759,7 +762,7 @@
           pixelFormat = TJ.PF_BGRA;
         intPixels = true;  break;
       default:
-        throw new Exception("Unsupported BufferedImage format");
+        throw new IllegalArgumentException("Unsupported BufferedImage format");
     }
     WritableRaster wr = dstImage.getRaster();
     if (intPixels) {
@@ -775,7 +778,7 @@
                   pixelFormat, flags);
       else {
         if (jpegBuf == null)
-          throw new Exception(NO_ASSOC_ERROR);
+          throw new IllegalStateException(NO_ASSOC_ERROR);
         decompress(jpegBuf, jpegBufSize, buf, 0, 0, scaledWidth, stride,
                    scaledHeight, pixelFormat, flags);
       }
@@ -784,7 +787,7 @@
         (ComponentSampleModel)dstImage.getSampleModel();
       int pixelSize = sm.getPixelStride();
       if (pixelSize != TJ.getPixelSize(pixelFormat))
-        throw new Exception("Inconsistency between pixel format and pixel size in BufferedImage");
+        throw new IllegalArgumentException("Inconsistency between pixel format and pixel size in BufferedImage");
       int pitch = sm.getScanlineStride();
       DataBufferByte db = (DataBufferByte)wr.getDataBuffer();
       byte[] buf = db.getData();
@@ -818,10 +821,10 @@
    */
   public BufferedImage decompress(int desiredWidth, int desiredHeight,
                                   int bufferedImageType, int flags)
-                                  throws Exception {
+                                  throws TJException {
     if ((yuvImage == null && (desiredWidth < 0 || desiredHeight < 0)) ||
         flags < 0)
-      throw new Exception("Invalid argument in decompress()");
+      throw new IllegalArgumentException("Invalid argument in decompress()");
     int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
     int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
     BufferedImage img = new BufferedImage(scaledWidth, scaledHeight,
@@ -833,57 +836,62 @@
   /**
    * Free the native structures associated with this decompressor instance.
    */
-  public void close() throws Exception {
+  @Override
+  public void close() throws TJException {
     if (handle != 0)
       destroy();
   }
 
+  @Override
   protected void finalize() throws Throwable {
     try {
       close();
-    } catch(Exception e) {
+    } catch(TJException e) {
     } finally {
       super.finalize();
     }
   };
 
-  private native void init() throws Exception;
+  private native void init() throws TJException;
 
-  private native void destroy() throws Exception;
+  private native void destroy() throws TJException;
 
   private native void decompressHeader(byte[] srcBuf, int size)
-    throws Exception;
+    throws TJException;
 
+  @Deprecated
   private native void decompress(byte[] srcBuf, int size, byte[] dstBuf,
     int desiredWidth, int pitch, int desiredHeight, int pixelFormat, int flags)
-    throws Exception; // deprecated
+    throws TJException;
 
   private native void decompress(byte[] srcBuf, int size, byte[] dstBuf, int x,
     int y, int desiredWidth, int pitch, int desiredHeight, int pixelFormat,
-    int flags) throws Exception;
+    int flags) throws TJException;
 
+  @Deprecated
   private native void decompress(byte[] srcBuf, int size, int[] dstBuf,
     int desiredWidth, int stride, int desiredHeight, int pixelFormat,
-    int flags) throws Exception; // deprecated
+    int flags) throws TJException;
 
   private native void decompress(byte[] srcBuf, int size, int[] dstBuf, int x,
     int y, int desiredWidth, int stride, int desiredHeight, int pixelFormat,
-    int flags) throws Exception;
+    int flags) throws TJException;
 
+  @Deprecated
   private native void decompressToYUV(byte[] srcBuf, int size, byte[] dstBuf,
-    int flags) throws Exception; // deprecated
+    int flags) throws TJException;
 
   private native void decompressToYUV(byte[] srcBuf, int size,
     byte[][] dstPlanes, int[] dstOffsets, int desiredWidth, int[] dstStrides,
-    int desiredheight, int flags) throws Exception;
+    int desiredheight, int flags) throws TJException;
 
   private native void decodeYUV(byte[][] srcPlanes, int[] srcOffsets,
     int[] srcStrides, int subsamp, byte[] dstBuf, int x, int y, int width,
-    int pitch, int height, int pixelFormat, int flags) throws Exception;
+    int pitch, int height, int pixelFormat, int flags) throws TJException;
 
   private native void decodeYUV(byte[][] srcPlanes, int[] srcOffsets,
     int[] srcStrides, int subsamp, int[] dstBuf, int x, int y, int width,
-    int stride, int height, int pixelFormat, int flags) throws Exception;
+    int stride, int height, int pixelFormat, int flags) throws TJException;
 
   static {
     TJLoader.load();
@@ -898,4 +906,4 @@
   protected int jpegSubsamp = -1;
   protected int jpegColorspace = -1;
   private ByteOrder byteOrder = null;
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJException.java b/java/org/libjpegturbo/turbojpeg/TJException.java
new file mode 100644
index 0000000..59c2041
--- /dev/null
+++ b/java/org/libjpegturbo/turbojpeg/TJException.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.libjpegturbo.turbojpeg;
+
+import java.io.IOException;
+
+public class TJException extends IOException {
+
+  private static final long serialVersionUID = 1L;
+
+  public TJException() {
+    super();
+  }
+
+  public TJException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public TJException(String message) {
+    super(message);
+  }
+
+  public TJException(Throwable cause) {
+    super(cause);
+  }
+
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader.java.in b/java/org/libjpegturbo/turbojpeg/TJLoader.java.in
index 22353a5..8397780 100644
--- a/java/org/libjpegturbo/turbojpeg/TJLoader.java.in
+++ b/java/org/libjpegturbo/turbojpeg/TJLoader.java.in
@@ -32,4 +32,4 @@
   static void load() {
     System.loadLibrary("@TURBOJPEG_DLL_NAME@");
   }
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl b/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
index a4f1c87..5ef3118 100644
--- a/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
+++ b/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
@@ -56,4 +56,4 @@
       }
     }
   }
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java b/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java
index e00fdf7..ddb1d75 100644
--- a/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,9 +34,9 @@
  */
 public class TJScalingFactor {
 
-  public TJScalingFactor(int num, int denom) throws Exception {
+  public TJScalingFactor(int num, int denom) {
     if (num < 1 || denom < 1)
-      throw new Exception("Numerator and denominator must be >= 1");
+      throw new IllegalArgumentException("Numerator and denominator must be >= 1");
     this.num = num;
     this.denom = denom;
   }
@@ -77,7 +78,7 @@
    * <code>other</code> have the same numerator and denominator.
    */
   public boolean equals(TJScalingFactor other) {
-    return (this.num == other.num && this.denom == other.denom);
+    return this.num == other.num && this.denom == other.denom;
   }
 
   /**
@@ -88,7 +89,7 @@
    * 1/1.
    */
   public boolean isOne() {
-    return (num == 1 && denom == 1);
+    return num == 1 && denom == 1;
   }
 
   /**
@@ -100,4 +101,4 @@
    * Denominator
    */
   private int denom = 1;
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/TJTransform.java b/java/org/libjpegturbo/turbojpeg/TJTransform.java
index b464ffd..7381f36 100644
--- a/java/org/libjpegturbo/turbojpeg/TJTransform.java
+++ b/java/org/libjpegturbo/turbojpeg/TJTransform.java
@@ -160,7 +160,7 @@
    * TJCustomFilter} interface, or null if no custom filter is needed
    */
   public TJTransform(int x, int y, int w, int h, int op, int options,
-                     TJCustomFilter cf) throws Exception {
+                     TJCustomFilter cf) {
     super(x, y, w, h);
     this.op = op;
     this.options = options;
@@ -184,7 +184,7 @@
    * TJCustomFilter} interface, or null if no custom filter is needed
    */
   public TJTransform(Rectangle r, int op, int options,
-                     TJCustomFilter cf) throws Exception {
+                     TJCustomFilter cf) {
     super(r);
     this.op = op;
     this.options = options;
diff --git a/java/org/libjpegturbo/turbojpeg/TJTransformer.java b/java/org/libjpegturbo/turbojpeg/TJTransformer.java
index 2e17344..d76647f 100644
--- a/java/org/libjpegturbo/turbojpeg/TJTransformer.java
+++ b/java/org/libjpegturbo/turbojpeg/TJTransformer.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2011, 2013-2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,7 +37,7 @@
   /**
    * Create a TurboJPEG lossless transformer instance.
    */
-  public TJTransformer() throws Exception {
+  public TJTransformer() throws TJException {
     init();
   }
 
@@ -47,7 +48,7 @@
    * @param jpegImage JPEG image buffer (size of the JPEG image is assumed to
    * be the length of the array.)  This buffer is not modified.
    */
-  public TJTransformer(byte[] jpegImage) throws Exception {
+  public TJTransformer(byte[] jpegImage) throws TJException {
     init();
     setSourceImage(jpegImage, jpegImage.length);
   }
@@ -61,7 +62,7 @@
    *
    * @param imageSize size of the JPEG image (in bytes)
    */
-  public TJTransformer(byte[] jpegImage, int imageSize) throws Exception {
+  public TJTransformer(byte[] jpegImage, int imageSize) throws TJException {
     init();
     setSourceImage(jpegImage, imageSize);
   }
@@ -94,9 +95,9 @@
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
   public void transform(byte[][] dstBufs, TJTransform[] transforms,
-                        int flags) throws Exception {
+                        int flags) throws TJException {
     if (jpegBuf == null)
-      throw new Exception("JPEG buffer not initialized");
+      throw new IllegalStateException("JPEG buffer not initialized");
     transformedSizes = transform(jpegBuf, jpegBufSize, dstBufs, transforms,
                                  flags);
   }
@@ -117,10 +118,10 @@
    * {@link TJ#FLAG_BOTTOMUP TJ.FLAG_*}
    */
   public TJDecompressor[] transform(TJTransform[] transforms, int flags)
-    throws Exception {
+                                    throws TJException {
     byte[][] dstBufs = new byte[transforms.length][];
     if (jpegWidth < 1 || jpegHeight < 1)
-      throw new Exception("JPEG buffer not initialized");
+      throw new IllegalStateException("JPEG buffer not initialized");
     for (int i = 0; i < transforms.length; i++) {
       int w = jpegWidth, h = jpegHeight;
       if ((transforms[i].options & TJTransform.OPT_CROP) != 0) {
@@ -143,20 +144,20 @@
    * @return an array containing the sizes of the transformed JPEG images
    * generated by the most recent transform operation.
    */
-  public int[] getTransformedSizes() throws Exception {
+  public int[] getTransformedSizes() {
     if (transformedSizes == null)
-      throw new Exception("No image has been transformed yet");
+      throw new IllegalStateException("No image has been transformed yet");
     return transformedSizes;
   }
 
-  private native void init() throws Exception;
+  private native void init() throws TJException;
 
   private native int[] transform(byte[] srcBuf, int srcSize, byte[][] dstBufs,
-    TJTransform[] transforms, int flags) throws Exception;
+    TJTransform[] transforms, int flags) throws TJException;
 
   static {
     TJLoader.load();
   }
 
   private int[] transformedSizes = null;
-};
+}
diff --git a/java/org/libjpegturbo/turbojpeg/YUVImage.java b/java/org/libjpegturbo/turbojpeg/YUVImage.java
index 2d790e9..1a05e62 100644
--- a/java/org/libjpegturbo/turbojpeg/YUVImage.java
+++ b/java/org/libjpegturbo/turbojpeg/YUVImage.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -86,8 +87,7 @@
    * @param subsamp the level of chrominance subsampling to be used in the YUV
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
-  public YUVImage(int width, int[] strides, int height, int subsamp)
-    throws Exception {
+  public YUVImage(int width, int[] strides, int height, int subsamp) {
     setBuf(null, null, width, strides, height, subsamp, true);
   }
 
@@ -105,8 +105,7 @@
    * @param subsamp the level of chrominance subsampling to be used in the YUV
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
-  public YUVImage(int width, int pad, int height, int subsamp)
-    throws Exception {
+  public YUVImage(int width, int pad, int height, int subsamp) {
     setBuf(new byte[TJ.bufSizeYUV(width, pad, height, subsamp)], width, pad,
            height, subsamp);
   }
@@ -146,7 +145,7 @@
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
   public YUVImage(byte[][] planes, int[] offsets, int width, int[] strides,
-                  int height, int subsamp) throws Exception {
+                  int height, int subsamp) {
     setBuf(planes, offsets, width, strides, height, subsamp, false);
   }
 
@@ -172,7 +171,7 @@
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
   public YUVImage(byte[] yuvImage, int width, int pad, int height,
-                  int subsamp) throws Exception {
+                  int subsamp) {
     setBuf(yuvImage, width, pad, height, subsamp);
   }
 
@@ -210,20 +209,20 @@
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
   public void setBuf(byte[][] planes, int[] offsets, int width, int strides[],
-                     int height, int subsamp) throws Exception {
+                     int height, int subsamp) {
     setBuf(planes, offsets, width, strides, height, subsamp, false);
   }
 
   private void setBuf(byte[][] planes, int[] offsets, int width, int strides[],
-                     int height, int subsamp, boolean alloc) throws Exception {
+                     int height, int subsamp, boolean alloc) {
     if ((planes == null && !alloc) || width < 1 || height < 1 || subsamp < 0 ||
         subsamp >= TJ.NUMSAMP)
-      throw new Exception("Invalid argument in YUVImage::setBuf()");
+      throw new IllegalArgumentException("Invalid argument in YUVImage::setBuf()");
 
     int nc = (subsamp == TJ.SAMP_GRAY ? 1 : 3);
     if (planes.length != nc || (offsets != null && offsets.length != nc) ||
         (strides != null && strides.length != nc))
-      throw new Exception("YUVImage::setBuf(): planes, offsets, or strides array is the wrong size");
+      throw new IllegalArgumentException("YUVImage::setBuf(): planes, offsets, or strides array is the wrong size");
 
     if (offsets == null)
       offsets = new int[nc];
@@ -239,15 +238,15 @@
         strides[i] = pw;
       if (alloc) {
         if (strides[i] < pw)
-          throw new Exception("Stride must be >= plane width when allocating a new YUV image");
+          throw new IllegalArgumentException("Stride must be >= plane width when allocating a new YUV image");
         planes[i] = new byte[strides[i] * ph];
       }
       if (planes[i] == null || offsets[i] < 0)
-        throw new Exception("Invalid argument in YUVImage::setBuf()");
+        throw new IllegalArgumentException("Invalid argument in YUVImage::setBuf()");
       if (strides[i] < 0 && offsets[i] - planeSize + pw < 0)
-        throw new Exception("Stride for plane " + i + " would cause memory to be accessed below plane boundary");
+        throw new IllegalArgumentException("Stride for plane " + i + " would cause memory to be accessed below plane boundary");
       if (planes[i].length < offsets[i] + planeSize)
-        throw new Exception("Image plane " + i + " is not large enough");
+        throw new IllegalArgumentException("Image plane " + i + " is not large enough");
     }
 
     yuvPlanes = planes;
@@ -279,13 +278,13 @@
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
   public void setBuf(byte[] yuvImage, int width, int pad, int height,
-                     int subsamp) throws Exception {
+                     int subsamp) {
     if (yuvImage == null || width < 1 || pad < 1 || ((pad & (pad - 1)) != 0) ||
         height < 1 || subsamp < 0 || subsamp >= TJ.NUMSAMP)
-      throw new Exception("Invalid argument in YUVImage::setBuf()");
+      throw new IllegalArgumentException("Invalid argument in YUVImage::setBuf()");
     if (yuvImage.length < TJ.bufSizeYUV(width, pad, height, subsamp))
-      throw new Exception("YUV image buffer is not large enough");
-    
+      throw new IllegalArgumentException("YUV image buffer is not large enough");
+
     int nc = (subsamp == TJ.SAMP_GRAY ? 1 : 3);
     byte[][] planes = new byte[nc][];
     int[] strides = new int[nc];
@@ -311,9 +310,9 @@
    *
    * @return the width of the YUV image (or subregion)
    */
-  public int getWidth() throws Exception {
+  public int getWidth() {
     if (yuvWidth < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvWidth;
   }
 
@@ -322,9 +321,9 @@
    *
    * @return the height of the YUV image (or subregion)
    */
-  public int getHeight() throws Exception {
+  public int getHeight() {
     if (yuvHeight < 1)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvHeight;
   }
 
@@ -334,11 +333,11 @@
    *
    * @return the line padding used in the YUV image buffer
    */
-  public int getPad() throws Exception {
+  public int getPad() {
     if (yuvPlanes == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     if (yuvPad < 1 || ((yuvPad & (yuvPad - 1)) != 0))
-      throw new Exception("Image is not stored in a unified buffer");
+      throw new IllegalStateException("Image is not stored in a unified buffer");
     return yuvPad;
   }
 
@@ -347,9 +346,9 @@
    *
    * @return the number of bytes per line of each plane in the YUV image
    */
-  public int[] getStrides() throws Exception {
+  public int[] getStrides() {
     if (yuvStrides == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvStrides;
   }
 
@@ -360,9 +359,9 @@
    * @return the offsets (in bytes) of each plane within the planes of a larger
    * YUV image
    */
-  public int[] getOffsets() throws Exception {
+  public int[] getOffsets() {
     if (yuvOffsets == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvOffsets;
   }
 
@@ -372,9 +371,9 @@
    *
    * @return the level of chrominance subsampling used in the YUV image
    */
-  public int getSubsamp() throws Exception {
+  public int getSubsamp() {
     if (yuvSubsamp < 0 || yuvSubsamp >= TJ.NUMSAMP)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvSubsamp;
   }
 
@@ -384,9 +383,9 @@
    *
    * @return the YUV image planes
    */
-  public byte[][] getPlanes() throws Exception {
+  public byte[][] getPlanes() {
     if (yuvPlanes == null)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     return yuvPlanes;
   }
 
@@ -396,13 +395,13 @@
    *
    * @return the YUV image buffer
    */
-  public byte[] getBuf() throws Exception {
+  public byte[] getBuf() {
     if (yuvPlanes == null || yuvSubsamp < 0 || yuvSubsamp >= TJ.NUMSAMP)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     int nc = (yuvSubsamp == TJ.SAMP_GRAY ? 1 : 3);
     for (int i = 1; i < nc; i++) {
       if (yuvPlanes[i] != yuvPlanes[0])
-        throw new Exception("Image is not stored in a unified buffer");
+        throw new IllegalStateException("Image is not stored in a unified buffer");
     }
     return yuvPlanes[0];
   }
@@ -413,15 +412,15 @@
    *
    * @return the size (in bytes) of the YUV image buffer
    */
-  public int getSize() throws Exception {
+  public int getSize() {
     if (yuvPlanes == null || yuvSubsamp < 0 || yuvSubsamp >= TJ.NUMSAMP)
-      throw new Exception(NO_ASSOC_ERROR);
+      throw new IllegalStateException(NO_ASSOC_ERROR);
     int nc = (yuvSubsamp == TJ.SAMP_GRAY ? 1 : 3);
     if (yuvPad < 1)
-      throw new Exception("Image is not stored in a unified buffer");
+      throw new IllegalStateException("Image is not stored in a unified buffer");
     for (int i = 1; i < nc; i++) {
       if (yuvPlanes[i] != yuvPlanes[0])
-        throw new Exception("Image is not stored in a unified buffer");
+        throw new IllegalStateException("Image is not stored in a unified buffer");
     }
     return TJ.bufSizeYUV(yuvWidth, yuvPad, yuvHeight, yuvSubsamp);
   }
@@ -438,4 +437,4 @@
   protected int yuvWidth = 0;
   protected int yuvHeight = 0;
   protected int yuvSubsamp = -1;
-};
+}
diff --git a/jcapimin.c b/jcapimin.c
index 3b005d3..15674be 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -6,7 +6,8 @@
  * Modified 2003-2010 by Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the compression half
  * of the JPEG library.  These are the "minimum" API routines that may be
@@ -49,8 +50,8 @@
    * complain here.
    */
   {
-    struct jpeg_error_mgr * err = cinfo->err;
-    void * client_data = cinfo->client_data; /* ignore Purify complaint here */
+    struct jpeg_error_mgr *err = cinfo->err;
+    void *client_data = cinfo->client_data; /* ignore Purify complaint here */
     MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
@@ -133,8 +134,8 @@
 jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
 {
   int i;
-  JQUANT_TBL * qtbl;
-  JHUFF_TBL * htbl;
+  JQUANT_TBL *qtbl;
+  JHUFF_TBL *htbl;
 
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     if ((qtbl = cinfo->quant_tbl_ptrs[i]) != NULL)
diff --git a/jcapistd.c b/jcapistd.c
index 167f020..5c6d0be 100644
--- a/jcapistd.c
+++ b/jcapistd.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the compression half
  * of the JPEG library.  These are the "standard" API routines that are
diff --git a/jcarith.c b/jcarith.c
index 71a84dd..6d3b8af 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains portable arithmetic entropy encoding routines for JPEG
  * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
@@ -25,10 +26,10 @@
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
-  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
-  INT32 a;               /* A register, normalized size of coding interval */
-  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
-  INT32 zc;          /* counter for pending 0x00 output values which might *
+  JLONG c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  JLONG a;               /* A register, normalized size of coding interval */
+  JLONG sc;        /* counter for stacked 0xFF values which might overflow */
+  JLONG zc;          /* counter for pending 0x00 output values which might *
                           * be discarded at the end ("Pacman" termination) */
   int ct;  /* bit shift counter, determines when next byte will be written */
   int buffer;                /* buffer for most recent output byte != 0xFF */
@@ -40,14 +41,14 @@
   int next_restart_num;         /* next restart number to write (0-7) */
 
   /* Pointers to statistics areas (these workspaces have image lifespan) */
-  unsigned char * dc_stats[NUM_ARITH_TBLS];
-  unsigned char * ac_stats[NUM_ARITH_TBLS];
+  unsigned char *dc_stats[NUM_ARITH_TBLS];
+  unsigned char *ac_stats[NUM_ARITH_TBLS];
 
   /* Statistics bin for coding with fixed probability 0.5 */
   unsigned char fixed_bin[4];
 } arith_entropy_encoder;
 
-typedef arith_entropy_encoder * arith_entropy_ptr;
+typedef arith_entropy_encoder *arith_entropy_ptr;
 
 /* The following two definitions specify the allocation chunk size
  * for the statistics area.
@@ -97,8 +98,8 @@
 #define CALCULATE_SPECTRAL_CONDITIONING
  */
 
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
  * which should be safe.
  */
 
@@ -118,7 +119,7 @@
 emit_byte (int val, j_compress_ptr cinfo)
 /* Write next output byte; we do not support suspension in this module. */
 {
-  struct jpeg_destination_mgr * dest = cinfo->dest;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
 
   *dest->next_output_byte++ = (JOCTET) val;
   if (--dest->free_in_buffer == 0)
@@ -135,7 +136,7 @@
 finish_pass (j_compress_ptr cinfo)
 {
   arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
-  INT32 temp;
+  JLONG temp;
 
   /* Section D.1.8: Termination of encoding */
 
@@ -222,7 +223,7 @@
 {
   register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
   register unsigned char nl, nm;
-  register INT32 qe, temp;
+  register JLONG qe, temp;
   register int sv;
 
   /* Fetch values from our compact representation of Table D.2:
@@ -322,7 +323,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   finish_pass(cinfo);
 
@@ -682,7 +683,7 @@
 encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, k, ke;
@@ -825,7 +826,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (gather_statistics)
     /* Make sure to avoid that in the master control logic!
diff --git a/jccoefct.c b/jccoefct.c
index a4acce5..a08d6e3 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the coefficient buffer controller for compression.
  * This controller is the top level of the JPEG compressor proper.
@@ -53,7 +54,7 @@
   jvirt_barray_ptr whole_image[MAX_COMPONENTS];
 } my_coef_controller;
 
-typedef my_coef_controller * my_coef_ptr;
+typedef my_coef_controller *my_coef_ptr;
 
 
 /* Forward declarations */
diff --git a/jccolext.c b/jccolext.c
index 2c6b7ac..479b320 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2012, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2012, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input colorspace conversion routines.
  */
@@ -34,7 +35,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG * ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
@@ -91,7 +92,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG * ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr;
   register JDIMENSION col;
diff --git a/jccolor.c b/jccolor.c
index 34ea23b..a93498a 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -7,7 +7,8 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2012, 2015 D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input colorspace conversion routines.
  */
@@ -25,10 +26,10 @@
   struct jpeg_color_converter pub; /* public fields */
 
   /* Private state for RGB->YCC conversion */
-  INT32 * rgb_ycc_tab;          /* => table for RGB to YCbCr conversion */
+  JLONG *rgb_ycc_tab;           /* => table for RGB to YCbCr conversion */
 } my_color_converter;
 
-typedef my_color_converter * my_cconvert_ptr;
+typedef my_color_converter *my_cconvert_ptr;
 
 
 /**************** RGB -> YCbCr conversion: most common case **************/
@@ -62,9 +63,9 @@
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define CBCR_OFFSET     ((INT32) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF        ((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)          ((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define CBCR_OFFSET     ((JLONG) CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 /* We allocate one big table and divide it up into eight parts, instead of
  * doing eight alloc_small requests.  This lets us use a single table base
@@ -197,13 +198,13 @@
 rgb_ycc_start (j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  INT32 * rgb_ycc_tab;
-  INT32 i;
+  JLONG *rgb_ycc_tab;
+  JLONG i;
 
   /* Allocate and fill in the conversion tables. */
-  cconvert->rgb_ycc_tab = rgb_ycc_tab = (INT32 *)
+  cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (TABLE_SIZE * sizeof(INT32)));
+                                (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
     rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
@@ -381,7 +382,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2, outptr3;
   register JDIMENSION col;
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 6a8e831..aef8517 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -6,8 +6,9 @@
  * libjpeg-turbo Modifications:
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2015 D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the forward-DCT management logic.
  * This code selects a particular DCT implementation to be used,
@@ -24,21 +25,21 @@
 
 /* Private subobject for this module */
 
-typedef void (*forward_DCT_method_ptr) (DCTELEM * data);
-typedef void (*float_DCT_method_ptr) (FAST_FLOAT * data);
+typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
+typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
 
 typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
-                                     DCTELEM * workspace);
+                                     DCTELEM *workspace);
 typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
                                            JDIMENSION start_col,
                                            FAST_FLOAT *workspace);
 
-typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM * divisors,
-                                     DCTELEM * workspace);
+typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
+                                     DCTELEM *workspace);
 typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
-                                           FAST_FLOAT * divisors,
-                                           FAST_FLOAT * workspace);
+                                           FAST_FLOAT *divisors,
+                                           FAST_FLOAT *workspace);
 
 METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
 
@@ -54,22 +55,22 @@
    * entries, because of scaling (especially for an unnormalized DCT).
    * Each table is given in normal array order.
    */
-  DCTELEM * divisors[NUM_QUANT_TBLS];
+  DCTELEM *divisors[NUM_QUANT_TBLS];
 
   /* work area for FDCT subroutine */
-  DCTELEM * workspace;
+  DCTELEM *workspace;
 
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
   float_DCT_method_ptr float_dct;
   float_convsamp_method_ptr float_convsamp;
   float_quantize_method_ptr float_quantize;
-  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
-  FAST_FLOAT * float_workspace;
+  FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
+  FAST_FLOAT *float_workspace;
 #endif
 } my_fdct_controller;
 
-typedef my_fdct_controller * my_fdct_ptr;
+typedef my_fdct_controller *my_fdct_ptr;
 
 
 #if BITS_IN_JSAMPLE == 8
@@ -169,7 +170,7 @@
  */
 
 LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
 {
   UDCTELEM2 fq, fr;
   UDCTELEM c;
@@ -208,7 +209,11 @@
 
   dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
   dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+#ifdef WITH_SIMD
   dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+#else
+  dtbl[DCTSIZE2 * 2] = 1;
+#endif
   dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
 
   if(r <= 16) return 0;
@@ -233,8 +238,8 @@
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtbl;
-  DCTELEM * dtbl;
+  JQUANT_TBL *qtbl;
+  DCTELEM *dtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -260,8 +265,8 @@
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
-        if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
-          && fdct->quantize == jsimd_quantize)
+        if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
+            fdct->quantize == jsimd_quantize)
           fdct->quantize = quantize;
 #else
         dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
@@ -300,16 +305,16 @@
         dtbl = fdct->divisors[qtblno];
         for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
-          if(!compute_reciprocal(
-            DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-                                  (INT32) aanscales[i]),
-                    CONST_BITS-3), &dtbl[i])
-            && fdct->quantize == jsimd_quantize)
+          if (!compute_reciprocal(
+                DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                      (JLONG) aanscales[i]),
+                        CONST_BITS-3), &dtbl[i]) &&
+              fdct->quantize == jsimd_quantize)
             fdct->quantize = quantize;
 #else
            dtbl[i] = (DCTELEM)
-             DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-                                   (INT32) aanscales[i]),
+             DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                   (JLONG) aanscales[i]),
                      CONST_BITS-3);
 #endif
         }
@@ -327,7 +332,7 @@
          * What's actually stored is 1/divisor so that the inner loop can
          * use a multiplication rather than a division.
          */
-        FAST_FLOAT * fdtbl;
+        FAST_FLOAT *fdtbl;
         int row, col;
         static const double aanscalefactor[DCTSIZE] = {
           1.0, 1.387039845, 1.306562965, 1.175875602,
@@ -365,7 +370,7 @@
  */
 
 METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 {
   register DCTELEM *workspaceptr;
   register JSAMPROW elemptr;
@@ -400,7 +405,7 @@
  */
 
 METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
   int i;
   DCTELEM temp;
@@ -422,12 +427,12 @@
       temp = -temp;
       product = (UDCTELEM2)(temp + corr) * recip;
       product >>= shift + sizeof(DCTELEM)*8;
-      temp = product;
+      temp = (DCTELEM)product;
       temp = -temp;
     } else {
       product = (UDCTELEM2)(temp + corr) * recip;
       product >>= shift + sizeof(DCTELEM)*8;
-      temp = product;
+      temp = (DCTELEM)product;
     }
     output_ptr[i] = (JCOEF) temp;
   }
@@ -482,7 +487,7 @@
  */
 
 METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
+forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
              JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
              JDIMENSION start_row, JDIMENSION start_col,
              JDIMENSION num_blocks)
@@ -490,8 +495,8 @@
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM * workspace;
+  DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
+  DCTELEM *workspace;
   JDIMENSION bi;
 
   /* Make sure the compiler doesn't look up these every pass */
@@ -519,7 +524,7 @@
 
 
 METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
 {
   register FAST_FLOAT *workspaceptr;
   register JSAMPROW elemptr;
@@ -550,7 +555,7 @@
 
 
 METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
 {
   register FAST_FLOAT temp;
   register int i;
@@ -572,7 +577,7 @@
 
 
 METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
+forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                    JDIMENSION start_row, JDIMENSION start_col,
                    JDIMENSION num_blocks)
@@ -580,8 +585,8 @@
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT * workspace;
+  FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
+  FAST_FLOAT *workspace;
   JDIMENSION bi;
 
 
diff --git a/jchuff.c b/jchuff.c
index 7e91fde..58acd70 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -5,7 +5,9 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2014-2016 D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015 Matthieu Darbois.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy encoding routines.
  *
@@ -19,7 +21,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"             /* Declarations shared with jcphuff.c */
+#include "jsimd.h"
 #include "jconfigint.h"
 #include <limits.h>
 
@@ -100,23 +102,25 @@
   int next_restart_num;         /* next restart number to write (0-7) */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
-  c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
 
 #ifdef ENTROPY_OPT_SUPPORTED    /* Statistics tables for optimization */
-  long * dc_count_ptrs[NUM_HUFF_TBLS];
-  long * ac_count_ptrs[NUM_HUFF_TBLS];
+  long *dc_count_ptrs[NUM_HUFF_TBLS];
+  long *ac_count_ptrs[NUM_HUFF_TBLS];
 #endif
+
+  int simd;  /* cached result of jsimd_can_huff_encode_one_block() */
 } huff_entropy_encoder;
 
-typedef huff_entropy_encoder * huff_entropy_ptr;
+typedef huff_entropy_encoder *huff_entropy_ptr;
 
 /* Working state while writing an MCU.
  * This struct contains all the fields that are needed by subroutines.
  */
 
 typedef struct {
-  JOCTET * next_output_byte;    /* => next byte to write in buffer */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
@@ -144,7 +148,7 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (gather_statistics) {
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -158,6 +162,8 @@
     entropy->pub.finish_pass = finish_pass_huff;
   }
 
+  entropy->simd = jsimd_can_huff_encode_one_block();
+
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     dctbl = compptr->dc_tbl_no;
@@ -214,7 +220,7 @@
 
 GLOBAL(void)
 jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
-                         c_derived_tbl ** pdtbl)
+                         c_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   c_derived_tbl *dtbl;
@@ -269,7 +275,7 @@
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((INT32) code) >= (((INT32) 1) << si))
+    if (((JLONG) code) >= (((JLONG) 1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -312,10 +318,10 @@
 
 
 LOCAL(boolean)
-dump_buffer (working_state * state)
+dump_buffer (working_state *state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
-  struct jpeg_destination_mgr * dest = state->cinfo->dest;
+  struct jpeg_destination_mgr *dest = state->cinfo->dest;
 
   if (! (*dest->empty_output_buffer) (state->cinfo))
     return FALSE;
@@ -389,7 +395,7 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((INT32) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG) 1)<<nbits) - 1; \
   CHECKBUF31() \
   PUT_BITS(code, size) \
   PUT_BITS(temp2, nbits) \
@@ -403,7 +409,7 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((INT32) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG) 1)<<nbits) - 1; \
   PUT_BITS(code, size) \
   CHECKBUF15() \
   PUT_BITS(temp2, nbits) \
@@ -455,7 +461,7 @@
 
 
 LOCAL(boolean)
-flush_bits (working_state * state)
+flush_bits (working_state *state)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
   size_t put_buffer;  int put_bits;
@@ -480,7 +486,24 @@
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
-encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
+encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
+                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
+{  /* Wrapper: delegates the encoding to jsimd_huff_encode_one_block() */
+  JOCTET _buffer[BUFSIZE], *buffer;  /* local array + working pointer */
+  size_t bytes, bytestocopy;  int localbuf = 0;  /* presumably macro temps */
+
+  LOAD_BUFFER()  /* macro defined elsewhere; presumably sets `buffer` */
+
+  buffer = jsimd_huff_encode_one_block(state, buffer, block, last_dc_val,
+                                       dctbl, actbl);  /* result -> buffer */
+
+  STORE_BUFFER()  /* macro defined elsewhere; may return early -- confirm */
+
+  return TRUE;  /* the only return statement visible in this body */
+}
+
+LOCAL(boolean)
+encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   int temp, temp2, temp3;
@@ -521,7 +544,7 @@
   EMIT_BITS(code, size)
 
   /* Mask off any extra bits in code */
-  temp2 &= (((INT32) 1)<<nbits) - 1;
+  temp2 &= (((JLONG) 1)<<nbits) - 1;
 
   /* Emit that number of bits of the value, if positive, */
   /* or the complement of its magnitude, if negative. */
@@ -593,7 +616,7 @@
  */
 
 LOCAL(boolean)
-emit_restart (working_state * state, int restart_num)
+emit_restart (working_state *state, int restart_num)
 {
   int ci;
 
@@ -623,7 +646,7 @@
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   working_state state;
   int blkn, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
@@ -639,16 +662,30 @@
   }
 
   /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-    if (! encode_one_block(&state,
-                           MCU_data[blkn][0], state.cur.last_dc_val[ci],
-                           entropy->dc_derived_tbls[compptr->dc_tbl_no],
-                           entropy->ac_derived_tbls[compptr->ac_tbl_no]))
-      return FALSE;
-    /* Update last_dc_val */
-    state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+  if (entropy->simd) {
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      if (! encode_one_block_simd(&state,
+                                  MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                                  entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                                  entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+        return FALSE;
+      /* Update last_dc_val */
+      state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+    }
+  } else {
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      if (! encode_one_block(&state,
+                             MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                             entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                             entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+        return FALSE;
+      /* Update last_dc_val */
+      state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+    }
   }
 
   /* Completed MCU, so update state */
@@ -791,7 +828,7 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int blkn, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Take care of restart intervals if needed */
   if (cinfo->restart_interval) {
@@ -847,7 +884,7 @@
  */
 
 GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
+jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 {
 #define MAX_CLEN 32             /* assumed maximum initial code length */
   UINT8 bits[MAX_CLEN+1];       /* bits[k] = # of symbols with code length k */
@@ -992,7 +1029,7 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
   boolean did_dc[NUM_HUFF_TBLS];
   boolean did_ac[NUM_HUFF_TBLS];
diff --git a/jchuff.h b/jchuff.h
index d49a992..4236089 100644
--- a/jchuff.h
+++ b/jchuff.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for Huffman entropy encoding routines
  * that are shared between the sequential encoder (jchuff.c) and the
@@ -39,4 +40,4 @@
 
 /* Generate an optimal table definition given the specified counts */
 EXTERN(void) jpeg_gen_optimal_table
-        (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]);
+        (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
diff --git a/jcinit.c b/jcinit.c
index 347cf6d..463bd8c 100644
--- a/jcinit.c
+++ b/jcinit.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains initialization logic for the JPEG compressor.
  * This routine is in charge of selecting the modules to be executed and
diff --git a/jcmainct.c b/jcmainct.c
index 6ca3768..d01f463 100644
--- a/jcmainct.c
+++ b/jcmainct.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the main buffer controller for compression.
  * The main buffer lies between the pre-processor and the JPEG
@@ -34,7 +35,7 @@
   JSAMPARRAY buffer[MAX_COMPONENTS];
 } my_main_controller;
 
-typedef my_main_controller * my_main_ptr;
+typedef my_main_controller *my_main_ptr;
 
 
 /* Forward declarations */
diff --git a/jcmarker.c b/jcmarker.c
index aac7dbd..463f665 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -6,7 +6,8 @@
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write JPEG datastream markers.
  */
@@ -93,7 +94,7 @@
   unsigned int last_restart_interval; /* last DRI value emitted; 0 after SOI */
 } my_marker_writer;
 
-typedef my_marker_writer * my_marker_ptr;
+typedef my_marker_writer *my_marker_ptr;
 
 
 /*
@@ -112,7 +113,7 @@
 emit_byte (j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
-  struct jpeg_destination_mgr * dest = cinfo->dest;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
 
   *(dest->next_output_byte)++ = (JOCTET) val;
   if (--dest->free_in_buffer == 0) {
@@ -149,7 +150,7 @@
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
-  JQUANT_TBL * qtbl = cinfo->quant_tbl_ptrs[index];
+  JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[index];
   int prec;
   int i;
 
@@ -188,7 +189,7 @@
 emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
-  JHUFF_TBL * htbl;
+  JHUFF_TBL *htbl;
   int length, i;
 
   if (is_ac) {
diff --git a/jcmaster.c b/jcmaster.c
index dca02d2..03a8b40 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -6,7 +6,8 @@
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains master control logic for the JPEG compressor.
  * These routines are concerned with parameter validation, initial setup,
@@ -50,7 +51,7 @@
 
 } my_comp_master;
 
-typedef my_comp_master * my_master_ptr;
+typedef my_comp_master *my_master_ptr;
 
 
 /*
@@ -178,12 +179,12 @@
  * determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
  */
 {
-  const jpeg_scan_info * scanptr;
+  const jpeg_scan_info *scanptr;
   int scanno, ncomps, ci, coefi, thisi;
   int Ss, Se, Ah, Al;
   boolean component_sent[MAX_COMPONENTS];
 #ifdef C_PROGRESSIVE_SUPPORTED
-  int * last_bitpos_ptr;
+  int *last_bitpos_ptr;
   int last_bitpos[MAX_COMPONENTS][DCTSIZE2];
   /* -1 until that coefficient has been seen; then last Al for it */
 #endif
@@ -319,7 +320,7 @@
   if (cinfo->scan_info != NULL) {
     /* Prepare for current scan --- the script is already validated */
     my_master_ptr master = (my_master_ptr) cinfo->master;
-    const jpeg_scan_info * scanptr = cinfo->scan_info + master->scan_number;
+    const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
 
     cinfo->comps_in_scan = scanptr->comps_in_scan;
     for (ci = 0; ci < scanptr->comps_in_scan; ci++) {
@@ -613,7 +614,7 @@
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode && !cinfo->arith_code)    /*  TEMPORARY HACK ??? */
+  if (cinfo->progressive_mode && !cinfo->arith_code)  /*  TEMPORARY HACK ??? */
     cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
 
   /* Initialize my private state */
diff --git a/jcomapi.c b/jcomapi.c
index d8f396d..6e5bf3d 100644
--- a/jcomapi.c
+++ b/jcomapi.c
@@ -2,10 +2,11 @@
  * jcomapi.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1994-1997, Thomas G. Lane.0
+ * Copyright (C) 1994-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface routines that are used for both
  * compression and decompression.
diff --git a/jconfig.h.in b/jconfig.h.in
index 42d86f2..02c12cc 100644
--- a/jconfig.h.in
+++ b/jconfig.h.in
@@ -6,6 +6,9 @@
 /* libjpeg-turbo version */
 #define LIBJPEG_TURBO_VERSION 0
 
+/* libjpeg-turbo version in integer form */
+#define LIBJPEG_TURBO_VERSION_NUMBER 0
+
 /* Support arithmetic encoding */
 #undef C_ARITH_CODING_SUPPORTED
 
diff --git a/jconfig.txt b/jconfig.txt
index 8acd8dd..808f87f 100644
--- a/jconfig.txt
+++ b/jconfig.txt
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1994, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file documents the configuration options that are required to
  * customize the JPEG software for a particular system.
diff --git a/jcparam.c b/jcparam.c
index 3194c9d..18b2d48 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -6,7 +6,8 @@
  * Modified 2003-2008 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains optional default-setting code for the JPEG compressor.
  * Applications do not have to use this file, but those that don't use it
@@ -33,7 +34,7 @@
  * are limited to 1..255 for JPEG baseline compatibility.
  */
 {
-  JQUANT_TBL ** qtblptr;
+  JQUANT_TBL **qtblptr;
   int i;
   long temp;
 
@@ -321,7 +322,7 @@
 GLOBAL(void)
 jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
 {
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   int ci;
 
 #define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl)  \
@@ -403,7 +404,7 @@
 #ifdef C_PROGRESSIVE_SUPPORTED
 
 LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info * scanptr, int ci,
+fill_a_scan (jpeg_scan_info *scanptr, int ci,
              int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for specified component */
 {
@@ -418,7 +419,7 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info * scanptr, int ncomps,
+fill_scans (jpeg_scan_info *scanptr, int ncomps,
             int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for each component */
 {
@@ -437,7 +438,7 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info * scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
 /* Support routine: generate interleaved DC scan if possible, else N scans */
 {
   int ci;
@@ -469,7 +470,7 @@
 {
   int ncomps = cinfo->num_components;
   int nscans;
-  jpeg_scan_info * scanptr;
+  jpeg_scan_info *scanptr;
 
   /* Safety check to ensure start_compress not called yet. */
   if (cinfo->global_state != CSTATE_START)
diff --git a/jcphuff.c b/jcphuff.c
index 5ce12b5..046e2e1 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy encoding routines for progressive JPEG.
  *
@@ -32,9 +33,9 @@
   /* Bit-level coding status.
    * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
    */
-  JOCTET * next_output_byte;    /* => next byte to write in buffer */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
-  INT32 put_buffer;             /* current bit-accumulation buffer */
+  size_t put_buffer;            /* current bit-accumulation buffer */
   int put_bits;                 /* # of bits now in it */
   j_compress_ptr cinfo;         /* link to cinfo (needed for dump_buffer) */
 
@@ -45,7 +46,7 @@
   int ac_tbl_no;                /* the table number of the single component */
   unsigned int EOBRUN;          /* run length of EOBs */
   unsigned int BE;              /* # of buffered correction bits before MCU */
-  char * bit_buffer;            /* buffer for correction bits (1 per char) */
+  char *bit_buffer;             /* buffer for correction bits (1 per char) */
   /* packing correction bits tightly would save some space but cost time... */
 
   unsigned int restarts_to_go;  /* MCUs left in this restart interval */
@@ -55,13 +56,13 @@
    * Since any one scan codes only DC or only AC, we only need one set
    * of tables, not one for DC and one for AC.
    */
-  c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
 
   /* Statistics tables for optimization; again, one set is enough */
-  long * count_ptrs[NUM_HUFF_TBLS];
+  long *count_ptrs[NUM_HUFF_TBLS];
 } phuff_entropy_encoder;
 
-typedef phuff_entropy_encoder * phuff_entropy_ptr;
+typedef phuff_entropy_encoder *phuff_entropy_ptr;
 
 /* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
  * buffer can hold.  Larger sizes may slightly improve compression, but
@@ -71,8 +72,8 @@
 
 #define MAX_CORR_BITS  1000     /* Max # of correction bits I can buffer */
 
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
  * which should be safe.
  */
 
@@ -110,7 +111,7 @@
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   entropy->cinfo = cinfo;
   entropy->gather_statistics = gather_statistics;
@@ -207,7 +208,7 @@
 dump_buffer (phuff_entropy_ptr entropy)
 /* Empty the output buffer; we do not support suspension in this module. */
 {
-  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
+  struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
 
   if (! (*dest->empty_output_buffer) (entropy->cinfo))
     ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
@@ -230,7 +231,7 @@
 /* Emit some bits, unless we are in gather mode */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
+  register size_t put_buffer = (size_t) code;
   register int put_bits = entropy->put_bits;
 
   /* if size is 0, caller used an invalid Huffman table entry */
@@ -240,7 +241,7 @@
   if (entropy->gather_statistics)
     return;                     /* do nothing if we're only getting stats */
 
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
+  put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
 
   put_bits += size;             /* new number of bits in buffer */
 
@@ -283,7 +284,7 @@
   if (entropy->gather_statistics)
     entropy->count_ptrs[tbl_no][symbol]++;
   else {
-    c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
+    c_derived_tbl *tbl = entropy->derived_tbls[tbl_no];
     emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
   }
 }
@@ -294,7 +295,7 @@
  */
 
 LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart,
+emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
                     unsigned int nbits)
 {
   if (entropy->gather_statistics)
@@ -382,7 +383,7 @@
   int blkn, ci;
   int Al = cinfo->Al;
   JBLOCKROW block;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   ISHIFT_TEMPS
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
@@ -769,7 +770,7 @@
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
   boolean did[NUM_HUFF_TBLS];
 
diff --git a/jcprepct.c b/jcprepct.c
index 3470de0..e72ebd8 100644
--- a/jcprepct.c
+++ b/jcprepct.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the compression preprocessing controller.
  * This controller manages the color conversion, downsampling,
@@ -69,7 +70,7 @@
 #endif
 } my_prep_controller;
 
-typedef my_prep_controller * my_prep_ptr;
+typedef my_prep_controller *my_prep_ptr;
 
 
 /*
@@ -136,7 +137,7 @@
   my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
   int numrows, ci;
   JDIMENSION inrows;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   while (*in_row_ctr < in_rows_avail &&
          *out_row_group_ctr < out_row_groups_avail) {
@@ -271,7 +272,7 @@
   my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
   int rgroup_height = cinfo->max_v_samp_factor;
   int ci, i;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JSAMPARRAY true_buffer, fake_buffer;
 
   /* Grab enough space for fake row pointers for all the components;
@@ -318,7 +319,7 @@
 {
   my_prep_ptr prep;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (need_full_buffer)         /* safety check */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
diff --git a/jcsample.c b/jcsample.c
index 24d31ae..879bd51 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -6,7 +6,9 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains downsampling routines.
  *
@@ -56,7 +58,7 @@
 
 /* Pointer to routine to downsample a single component */
 typedef void (*downsample1_ptr) (j_compress_ptr cinfo,
-                                 jpeg_component_info * compptr,
+                                 jpeg_component_info *compptr,
                                  JSAMPARRAY input_data,
                                  JSAMPARRAY output_data);
 
@@ -69,7 +71,7 @@
   downsample1_ptr methods[MAX_COMPONENTS];
 } my_downsampler;
 
-typedef my_downsampler * my_downsample_ptr;
+typedef my_downsampler *my_downsample_ptr;
 
 
 /*
@@ -122,7 +124,7 @@
 {
   my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JSAMPARRAY in_ptr, out_ptr;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -142,14 +144,14 @@
  */
 
 METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;  /* outcol_h == outcol*h_expand */
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   JSAMPROW inptr, outptr;
-  INT32 outvalue;
+  JLONG outvalue;
 
   h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
   v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
@@ -172,7 +174,7 @@
       for (v = 0; v < v_expand; v++) {
         inptr = input_data[inrow+v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (INT32) GETJSAMPLE(*inptr++);
+          outvalue += (JLONG) GETJSAMPLE(*inptr++);
         }
       }
       *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
@@ -189,7 +191,7 @@
  */
 
 METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                      JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   /* Copy the data */
@@ -214,7 +216,7 @@
  */
 
 METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                  JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
@@ -251,7 +253,7 @@
  */
 
 METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                  JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
@@ -294,14 +296,14 @@
  */
 
 METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                         JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION colctr;
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
-  INT32 membersum, neighsum, memberscale, neighscale;
+  JLONG membersum, neighsum, memberscale, neighscale;
 
   /* Expand input data enough to let all the output samples be generated
    * by the standard loop.  Special-casing padded output would be more
@@ -401,7 +403,7 @@
   JDIMENSION colctr;
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   register JSAMPROW inptr, above_ptr, below_ptr, outptr;
-  INT32 membersum, neighsum, memberscale, neighscale;
+  JLONG membersum, neighsum, memberscale, neighscale;
   int colsum, lastcolsum, nextcolsum;
 
   /* Expand input data enough to let all the output samples be generated
@@ -470,7 +472,7 @@
 {
   my_downsample_ptr downsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   boolean smoothok = TRUE;
 
   downsample = (my_downsample_ptr)
diff --git a/jctrans.c b/jctrans.c
index ccd7b34..6f16b05 100644
--- a/jctrans.c
+++ b/jctrans.c
@@ -6,7 +6,8 @@
  * Modified 2000-2009 by Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains library routines for transcoding compression,
  * that is, writing raw DCT coefficient arrays to an output JPEG file.
@@ -20,9 +21,9 @@
 
 /* Forward declarations */
 LOCAL(void) transencode_master_selection
-        (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays);
+        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
 LOCAL(void) transencode_coef_controller
-        (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays);
+        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
 
 
 /*
@@ -38,7 +39,7 @@
  */
 
 GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)
+jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -66,7 +67,7 @@
 jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
                                j_compress_ptr dstinfo)
 {
-  JQUANT_TBL ** qtblptr;
+  JQUANT_TBL **qtblptr;
   jpeg_component_info *incomp, *outcomp;
   JQUANT_TBL *c_quant, *slot_quant;
   int tblno, ci, coefi;
@@ -165,7 +166,7 @@
 
 LOCAL(void)
 transencode_master_selection (j_compress_ptr cinfo,
-                              jvirt_barray_ptr * coef_arrays)
+                              jvirt_barray_ptr *coef_arrays)
 {
   /* Although we don't actually use input_components for transcoding,
    * jcmaster.c's initial_setup will complain if input_components is 0.
@@ -227,13 +228,13 @@
   int MCU_rows_per_iMCU_row;    /* number of such rows needed */
 
   /* Virtual block array for each component. */
-  jvirt_barray_ptr * whole_image;
+  jvirt_barray_ptr *whole_image;
 
   /* Workspace for constructing dummy blocks at right/bottom edges. */
   JBLOCKROW dummy_buffer[C_MAX_BLOCKS_IN_MCU];
 } my_coef_controller;
 
-typedef my_coef_controller * my_coef_ptr;
+typedef my_coef_controller *my_coef_ptr;
 
 
 LOCAL(void)
@@ -374,7 +375,7 @@
 
 LOCAL(void)
 transencode_coef_controller (j_compress_ptr cinfo,
-                             jvirt_barray_ptr * coef_arrays)
+                             jvirt_barray_ptr *coef_arrays)
 {
   my_coef_ptr coef;
   JBLOCKROW buffer;
diff --git a/jdapimin.c b/jdapimin.c
index fc8898f..f80a146 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the decompression half
  * of the JPEG library.  These are the "minimum" API routines that may be
@@ -21,6 +22,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jdmaster.h"
 
 
 /*
@@ -82,6 +84,14 @@
 
   /* OK, I'm ready */
   cinfo->global_state = DSTATE_START;
+
+  /* The master struct is used to store extension parameters, so we allocate it
+   * here.
+   */
+  cinfo->master = (struct jpeg_decomp_master *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_decomp_master));
+  MEMZERO(cinfo->master, sizeof(my_decomp_master));
 }
 
 
diff --git a/jdapistd.c b/jdapistd.c
index 3be527c..37afc84 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the decompression half
  * of the JPEG library.  These are the "standard" API routines that are
@@ -16,11 +18,11 @@
  * whole decompression library into a transcoder.
  */
 
-#define JPEG_INTERNALS
 #include "jinclude.h"
-#include "jpeglib.h"
-#include "jpegcomp.h"
-
+#include "jdmainct.h"
+#include "jdcoefct.h"
+#include "jdsample.h"
+#include "jmemsys.h"
 
 /* Forward declarations */
 LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
@@ -139,6 +141,110 @@
 
 
 /*
+ * Enable partial scanline decompression
+ *
+ * Must be called after jpeg_start_decompress() and before any calls to
+ * jpeg_read_scanlines() or jpeg_skip_scanlines().
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(void)
+jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                    JDIMENSION *width)
+{
+  int ci, align, orig_downsampled_width;
+  JDIMENSION input_xoffset;
+  boolean reinit_upsampler = FALSE;
+  jpeg_component_info *compptr;
+
+  if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  if (!xoffset || !width)
+    ERREXIT(cinfo, JERR_BAD_CROP_SPEC);
+
+  /* xoffset and width must fall within the output width (wraparound-safe.) */
+  if (*width == 0 || *width > cinfo->output_width ||
+      *xoffset > cinfo->output_width - *width)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+
+  /* No need to do anything if the caller wants the entire width. */
+  if (*width == cinfo->output_width)
+    return;
+
+  /* Ensuring the proper alignment of xoffset is tricky.  At minimum, it
+   * must align with an MCU boundary, because:
+   *
+   *   (1) The IDCT is performed in blocks, and it is not feasible to modify
+   *       the algorithm so that it can transform partial blocks.
+   *   (2) Because of the SIMD extensions, any input buffer passed to the
+   *       upsampling and color conversion routines must be aligned to the
+   *       SIMD word size (for instance, 128-bit in the case of SSE2.)  The
+   *       easiest way to accomplish this without copying data is to ensure
+   *       that upsampling and color conversion begin at the start of the
+   *       first MCU column that will be inverse transformed.
+   *
+   * In practice, we actually impose a stricter alignment requirement.  We
+   * require that xoffset be a multiple of the maximum MCU column width of all
+   * of the components (the "iMCU column width.")  This is to simplify the
+   * single-pass decompression case, allowing us to use the same MCU column
+   * width for all of the components.
+   */
+  align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+
+  /* Adjust xoffset to the nearest iMCU boundary <= the requested value */
+  input_xoffset = *xoffset;
+  *xoffset = (input_xoffset / align) * align;
+
+  /* Adjust the width so that the right edge of the output image is as
+   * requested (only the left edge is altered.)  It is important that calling
+   * programs check this value after this function returns, so that they can
+   * allocate an output buffer with the appropriate size.
+   */
+  *width = *width + input_xoffset - *xoffset;
+  cinfo->output_width = *width;
+
+  /* Set the first and last iMCU columns that we must decompress.  These values
+   * will be used in single-scan decompressions.
+   */
+  cinfo->master->first_iMCU_col = *xoffset / (JDIMENSION) align;
+  cinfo->master->last_iMCU_col =
+    (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
+                               (long) align) - 1;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    /* Set downsampled_width to the new output width. */
+    orig_downsampled_width = compptr->downsampled_width;
+    compptr->downsampled_width =
+      (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
+                                         compptr->h_samp_factor),
+                                 (long) cinfo->max_h_samp_factor);
+    if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
+      reinit_upsampler = TRUE;
+
+    /* Set the first and last iMCU columns that we must decompress.  These
+     * values will be used in multi-scan decompressions.
+     */
+    cinfo->master->first_MCU_col[ci] =
+      (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) /
+                   (long) align;
+    cinfo->master->last_MCU_col[ci] =
+      (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
+                                         compptr->h_samp_factor),
+                                 (long) align) - 1;
+  }
+
+  if (reinit_upsampler) {
+    cinfo->master->jinit_upsampler_no_alloc = TRUE;
+    jinit_upsampler(cinfo);
+    cinfo->master->jinit_upsampler_no_alloc = FALSE;
+  }
+}
+
+
+/*
  * Read some scanlines of data from the JPEG decompressor.
  *
  * The return value will be the number of lines actually read.
@@ -179,6 +285,236 @@
 }
 
 
+/* Dummy color converter (does nothing); see read_and_discard_scanlines() */
+LOCAL(void)
+noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+              JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+
+/*
+ * Read num_lines scanlines via jpeg_read_scanlines() and discard the output.
+ * In some cases this is preferable to skipping, because it maintains the
+ * internal state of the context-based upsampler.  A dummy (no-op) color
+ * converter is installed while reading and the real one restored afterward,
+ * in order to avoid valgrind errors and to achieve the best performance.
+ */
+
+LOCAL(void)
+read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+{
+  JDIMENSION n;
+  void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows);
+
+  color_convert = cinfo->cconvert->color_convert;
+  cinfo->cconvert->color_convert = noop_convert;
+
+  for (n = 0; n < num_lines; n++)
+    jpeg_read_scanlines(cinfo, NULL, 1);
+
+  cinfo->cconvert->color_convert = color_convert;
+}
+
+
+/*
+ * Called by jpeg_skip_scanlines().  Skips whole row groups by incrementing the
+ * rowgroup counter, then reads and discards any leftover partial row group.
+ */
+
+LOCAL(void)
+increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+{
+  JDIMENSION rows_left;
+  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+
+  /* Increment the counter to the next row group after the skipped rows. */
+  main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
+
+  /* Partially skipping a row group would involve modifying the internal state
+   * of the upsampler, so read the remaining rows into a dummy buffer instead.
+   */
+  rows_left = rows % cinfo->max_v_samp_factor;
+  cinfo->output_scanline += rows - rows_left;
+
+  read_and_discard_scanlines(cinfo, rows_left);
+}
+
+/*
+ * Skips some scanlines of data from the JPEG decompressor.
+ *
+ * The return value will be the number of lines actually skipped.  If skipping
+ * num_lines would move beyond the end of the image, then the actual number of
+ * lines remaining in the image is returned.  Otherwise, the return value will
+ * be equal to num_lines.
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+{
+  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  JDIMENSION i, x;
+  int y;
+  JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
+  JDIMENSION lines_to_skip, lines_to_read;
+
+  if (cinfo->global_state != DSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Do not skip past the bottom of the image; report the lines remaining. */
+  if (cinfo->output_scanline + num_lines >= cinfo->output_height) {
+    num_lines = cinfo->output_height - cinfo->output_scanline;
+    cinfo->output_scanline = cinfo->output_height;
+    return num_lines;
+  }
+  if (num_lines == 0)
+    return 0;
+
+  lines_per_iMCU_row = cinfo->_min_DCT_scaled_size * cinfo->max_v_samp_factor;
+  lines_left_in_iMCU_row =
+    (lines_per_iMCU_row - (cinfo->output_scanline % lines_per_iMCU_row)) %
+    lines_per_iMCU_row;
+  lines_after_iMCU_row = num_lines - lines_left_in_iMCU_row;
+
+  /* Skip the lines remaining in the current iMCU row.  When upsampling
+   * requires context rows, we need the previous and next rows in order to read
+   * the current row.  This adds some complexity.
+   */
+  if (cinfo->upsample->need_context_rows) {
+    /* If the skipped lines would not move us past the current iMCU row, we
+     * read the lines and ignore them.  There might be a faster way of doing
+     * this, but we are facing increasing complexity for diminishing returns.
+     * The increasing complexity would be a by-product of meddling with the
+     * state machine used to skip context rows.  Near the end of an iMCU row,
+     * the next iMCU row may have already been entropy-decoded.  In this unique
+     * case, we will read the next iMCU row if we cannot skip past it as well.
+     */
+    if ((num_lines < lines_left_in_iMCU_row + 1) ||
+        (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full &&
+         lines_after_iMCU_row < lines_per_iMCU_row + 1)) {
+      read_and_discard_scanlines(cinfo, num_lines);
+      return num_lines;
+    }
+
+    /* If the next iMCU row has already been entropy-decoded, make sure that
+     * we do not skip too far.
+     */
+    if (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full) {
+      cinfo->output_scanline += lines_left_in_iMCU_row + lines_per_iMCU_row;
+      lines_after_iMCU_row -= lines_per_iMCU_row;
+    } else {
+      cinfo->output_scanline += lines_left_in_iMCU_row;
+    }
+
+    /* If we have just completed the first block, adjust the buffer pointers */
+    if (main_ptr->iMCU_row_ctr == 0 ||
+        (main_ptr->iMCU_row_ctr == 1 && lines_left_in_iMCU_row > 2))
+      set_wraparound_pointers(cinfo);
+    main_ptr->buffer_full = FALSE;
+    main_ptr->rowgroup_ctr = 0;
+    main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+    upsample->next_row_out = cinfo->max_v_samp_factor;
+    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+  }
+
+  /* Skipping is much simpler when context rows are not required. */
+  else {
+    if (num_lines < lines_left_in_iMCU_row) {
+      increment_simple_rowgroup_ctr(cinfo, num_lines);
+      return num_lines;
+    } else {
+      cinfo->output_scanline += lines_left_in_iMCU_row;
+      main_ptr->buffer_full = FALSE;
+      main_ptr->rowgroup_ctr = 0;
+      upsample->next_row_out = cinfo->max_v_samp_factor;
+      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    }
+  }
+
+  /* Calculate how many full iMCU rows we can skip. */
+  if (cinfo->upsample->need_context_rows)
+    lines_to_skip = ((lines_after_iMCU_row - 1) / lines_per_iMCU_row) *
+                    lines_per_iMCU_row;
+  else
+    lines_to_skip = (lines_after_iMCU_row / lines_per_iMCU_row) *
+                    lines_per_iMCU_row;
+  /* Calculate the number of lines that remain to be skipped after skipping all
+   * of the full iMCU rows that we can.  We will not read these lines unless we
+   * have to.
+   */
+  lines_to_read = lines_after_iMCU_row - lines_to_skip;
+
+  /* For images requiring multiple scans (progressive, non-interleaved, etc.),
+   * all of the entropy decoding occurs in jpeg_start_decompress(), assuming
+   * that the input data source is non-suspending.  This makes skipping easy.
+   */
+  if (cinfo->inputctl->has_multiple_scans) {
+    if (cinfo->upsample->need_context_rows) {
+      cinfo->output_scanline += lines_to_skip;
+      cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+      main_ptr->iMCU_row_ctr += lines_after_iMCU_row / lines_per_iMCU_row;
+      /* It is complex to properly move to the middle of a context block, so
+       * read the remaining lines instead of skipping them.
+       */
+      read_and_discard_scanlines(cinfo, lines_to_read);
+    } else {
+      cinfo->output_scanline += lines_to_skip;
+      cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+      increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+    }
+    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    return num_lines;
+  }
+
+  /* Skip the iMCU rows that we can safely skip. */
+  for (i = 0; i < lines_to_skip; i += lines_per_iMCU_row) {
+    for (y = 0; y < coef->MCU_rows_per_iMCU_row; y++) {
+      for (x = 0; x < cinfo->MCUs_per_row; x++) {
+        /* Calling decode_mcu() with a NULL pointer causes it to discard the
+         * decoded coefficients.  This is ~5% faster for large subsets, but
+         * it's tough to tell a difference for smaller images.
+         */
+        (*cinfo->entropy->decode_mcu) (cinfo, NULL);
+      }
+    }
+    cinfo->input_iMCU_row++;
+    cinfo->output_iMCU_row++;
+    if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows)
+      start_iMCU_row(cinfo);
+    else
+      (*cinfo->inputctl->finish_input_pass) (cinfo);
+  }
+  cinfo->output_scanline += lines_to_skip;
+
+  if (cinfo->upsample->need_context_rows) {
+    /* Context-based upsampling keeps track of iMCU rows. */
+    main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
+
+    /* It is complex to properly move to the middle of a context block, so
+     * read the remaining lines instead of skipping them.
+     */
+    read_and_discard_scanlines(cinfo, lines_to_read);
+  } else {
+    increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+  }
+
+  /* Since skipping lines involves skipping the upsampling step, the value of
+   * "rows_to_go" will become invalid unless we set it here.  NOTE: This is a
+   * bit odd, since "rows_to_go" seems to be redundantly keeping track of
+   * output_scanline.
+   */
+  upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+
+  /* Always skip the requested number of lines. */
+  return num_lines;
+}
+
 /*
  * Alternate entry point to read raw data.
  * Processes exactly one iMCU row per call, unless suspended.
diff --git a/jdarith.c b/jdarith.c
index 885c8eb..98d5fad 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -2,10 +2,11 @@
  * jdarith.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Developed 1997-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * Developed 1997-2015 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains portable arithmetic entropy decoding routines for JPEG
  * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
@@ -25,8 +26,8 @@
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
-  INT32 c;       /* C register, base of coding interval + input bit buffer */
-  INT32 a;               /* A register, normalized size of coding interval */
+  JLONG c;       /* C register, base of coding interval + input bit buffer */
+  JLONG a;               /* A register, normalized size of coding interval */
   int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
                                                          /* init: ct = -16 */
                                                          /* run: ct = 0..7 */
@@ -37,14 +38,14 @@
   unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to statistics areas (these workspaces have image lifespan) */
-  unsigned char * dc_stats[NUM_ARITH_TBLS];
-  unsigned char * ac_stats[NUM_ARITH_TBLS];
+  unsigned char *dc_stats[NUM_ARITH_TBLS];
+  unsigned char *ac_stats[NUM_ARITH_TBLS];
 
   /* Statistics bin for coding with fixed probability 0.5 */
   unsigned char fixed_bin[4];
 } arith_entropy_decoder;
 
-typedef arith_entropy_decoder * arith_entropy_ptr;
+typedef arith_entropy_decoder *arith_entropy_ptr;
 
 /* The following two definitions specify the allocation chunk size
  * for the statistics area.
@@ -67,7 +68,7 @@
 get_byte (j_decompress_ptr cinfo)
 /* Read next input byte; we do not support suspension in this module. */
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   if (src->bytes_in_buffer == 0)
     if (! (*src->fill_input_buffer) (cinfo))
@@ -96,7 +97,7 @@
  * (instead of fixed) with the bit shift counter CT.
  * Thus, we also need only one (variable instead of
  * fixed size) shift for the LPS/MPS decision, and
- * we can get away with any renormalization update
+ * we can do away with any renormalization update
  * of C (except for new data insertion, of course).
  *
  * I've also introduced a new scheme for accessing
@@ -109,7 +110,7 @@
 {
   register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
   register unsigned char nl, nm;
-  register INT32 qe, temp;
+  register JLONG qe, temp;
   register int sv, data;
 
   /* Renormalization & data input per section D.2.6 */
@@ -193,7 +194,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Advance past the RSTn marker */
   if (! (*cinfo->marker->read_restart_marker) (cinfo))
@@ -202,13 +203,13 @@
   /* Re-initialize statistics areas */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+    if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
-    if (! cinfo->progressive_mode || cinfo->Ss) {
+    if (!cinfo->progressive_mode || cinfo->Ss) {
       MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
     }
   }
@@ -498,7 +499,7 @@
 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, sign, k;
@@ -516,7 +517,7 @@
   /* Outer loop handles each block in the MCU */
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
+    block = MCU_data ? MCU_data[blkn] : NULL;
     ci = cinfo->MCU_membership[blkn];
     compptr = cinfo->cur_comp_info[ci];
 
@@ -563,7 +564,8 @@
       entropy->last_dc_val[ci] += v;
     }
 
-    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+    if (block)
+      (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
 
     /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
 
@@ -607,7 +609,8 @@
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
       v += 1; if (sign) v = -v;
-      (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+      if (block)
+        (*block)[jpeg_natural_order[k]] = (JCOEF) v;
     }
   }
 
@@ -624,7 +627,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (cinfo->progressive_mode) {
     /* Validate progressive scan parameters */
@@ -691,7 +694,7 @@
   /* Allocate & initialize requested statistics areas */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+    if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       tbl = compptr->dc_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
@@ -703,7 +706,7 @@
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
-    if (! cinfo->progressive_mode || cinfo->Ss) {
+    if (!cinfo->progressive_mode || cinfo->Ss) {
       tbl = compptr->ac_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
diff --git a/jdatadst-tj.c b/jdatadst-tj.c
index 1f6f3a5..c6144ec 100644
--- a/jdatadst-tj.c
+++ b/jdatadst-tj.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2014 D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2014, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains compression data destination routines for the case of
  * emitting JPEG data to memory or to a file (or any stdio stream).
@@ -23,7 +24,7 @@
 #include "jerror.h"
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void * malloc (size_t size);
+extern void *malloc (size_t size);
 extern void free (void *ptr);
 #endif
 
@@ -36,15 +37,15 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  unsigned char ** outbuffer;   /* target buffer */
-  unsigned long * outsize;
-  unsigned char * newbuffer;    /* newly allocated buffer */
-  JOCTET * buffer;              /* start of buffer */
+  unsigned char **outbuffer;    /* target buffer */
+  unsigned long *outsize;
+  unsigned char *newbuffer;     /* newly allocated buffer */
+  JOCTET *buffer;               /* start of buffer */
   size_t bufsize;
   boolean alloc;
 } my_mem_destination_mgr;
 
-typedef my_mem_destination_mgr * my_mem_dest_ptr;
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
 
 
 /*
@@ -86,7 +87,7 @@
 empty_mem_output_buffer (j_compress_ptr cinfo)
 {
   size_t nextsize;
-  JOCTET * nextbuffer;
+  JOCTET *nextbuffer;
   my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
 
   if (!dest->alloc) ERREXIT(cinfo, JERR_BUFFER_SIZE);
@@ -147,7 +148,7 @@
 
 GLOBAL(void)
 jpeg_mem_dest_tj (j_compress_ptr cinfo,
-               unsigned char ** outbuffer, unsigned long * outsize,
+               unsigned char **outbuffer, unsigned long *outsize,
                boolean alloc)
 {
   boolean reused = FALSE;
@@ -166,6 +167,11 @@
     dest = (my_mem_dest_ptr) cinfo->dest;
     dest->newbuffer = NULL;
     dest->buffer = NULL;
+  } else if (cinfo->dest->init_destination != init_mem_destination) {
+    /* It is unsafe to reuse the existing destination manager unless it was
+     * created by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   dest = (my_mem_dest_ptr) cinfo->dest;
diff --git a/jdatadst.c b/jdatadst.c
index 7a40e4f..dcaf6f0 100644
--- a/jdatadst.c
+++ b/jdatadst.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2013, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains compression data destination routines for the case of
  * emitting JPEG data to memory or to a file (or any stdio stream).
@@ -23,7 +24,7 @@
 #include "jerror.h"
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void * malloc (size_t size);
+extern void *malloc (size_t size);
 extern void free (void *ptr);
 #endif
 
@@ -33,11 +34,11 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  FILE * outfile;               /* target stream */
-  JOCTET * buffer;              /* start of buffer */
+  FILE *outfile;                /* target stream */
+  JOCTET *buffer;               /* start of buffer */
 } my_destination_mgr;
 
-typedef my_destination_mgr * my_dest_ptr;
+typedef my_destination_mgr *my_dest_ptr;
 
 #define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
 
@@ -48,14 +49,14 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  unsigned char ** outbuffer;   /* target buffer */
-  unsigned long * outsize;
-  unsigned char * newbuffer;    /* newly allocated buffer */
-  JOCTET * buffer;              /* start of buffer */
+  unsigned char **outbuffer;    /* target buffer */
+  unsigned long *outsize;
+  unsigned char *newbuffer;     /* newly allocated buffer */
+  JOCTET *buffer;               /* start of buffer */
   size_t bufsize;
 } my_mem_destination_mgr;
 
-typedef my_mem_destination_mgr * my_mem_dest_ptr;
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
 #endif
 
 
@@ -130,7 +131,7 @@
 empty_mem_output_buffer (j_compress_ptr cinfo)
 {
   size_t nextsize;
-  JOCTET * nextbuffer;
+  JOCTET *nextbuffer;
   my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
 
   /* Try to allocate new buffer with double size */
@@ -203,20 +204,25 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
+jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
 {
   my_dest_ptr dest;
 
   /* The destination object is made permanent so that multiple JPEG images
    * can be written to the same file without re-executing jpeg_stdio_dest.
-   * This makes it dangerous to use this manager and a different destination
-   * manager serially with the same JPEG object, because their private object
-   * sizes may be different.  Caveat programmer.
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                   sizeof(my_destination_mgr));
+  } else if (cinfo->dest->init_destination != init_destination) {
+    /* It is unsafe to reuse the existing destination manager unless it was
+     * created by this function.  Otherwise, there is no guarantee that the
+     * opaque structure is the right size.  Note that we could just create a
+     * new structure, but the old structure would not be freed until
+     * jpeg_destroy_compress() was called.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   dest = (my_dest_ptr) cinfo->dest;
@@ -237,11 +243,14 @@
  * larger memory, so the buffer is available to the application after
  * finishing compression, and then the application is responsible for
  * freeing the requested memory.
+ * Note:  An initial buffer supplied by the caller is expected to be
+ * managed by the application.  The library does not free such buffer
+ * when allocating a larger buffer.
  */
 
 GLOBAL(void)
 jpeg_mem_dest (j_compress_ptr cinfo,
-               unsigned char ** outbuffer, unsigned long * outsize)
+               unsigned char **outbuffer, unsigned long *outsize)
 {
   my_mem_dest_ptr dest;
 
@@ -255,6 +264,11 @@
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                   sizeof(my_mem_destination_mgr));
+  } else if (cinfo->dest->init_destination != init_mem_destination) {
+    /* It is unsafe to reuse the existing destination manager unless it was
+     * created by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   dest = (my_mem_dest_ptr) cinfo->dest;
diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c
index 09f9596..05456c8 100644
--- a/jdatasrc-tj.c
+++ b/jdatasrc-tj.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains decompression data source routines for the case of
  * reading JPEG data from memory or from a file (or any stdio stream).
@@ -105,7 +106,7 @@
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
@@ -157,9 +158,9 @@
 
 GLOBAL(void)
 jpeg_mem_src_tj (j_decompress_ptr cinfo,
-              unsigned char * inbuffer, unsigned long insize)
+                 const unsigned char *inbuffer, unsigned long insize)
 {
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
   if (inbuffer == NULL || insize == 0)  /* Treat empty input as fatal error */
     ERREXIT(cinfo, JERR_INPUT_EMPTY);
@@ -172,6 +173,11 @@
     cinfo->src = (struct jpeg_source_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                   sizeof(struct jpeg_source_mgr));
+  } else if (cinfo->src->init_source != init_mem_source) {
+    /* It is unsafe to reuse the existing source manager unless it was created
+     * by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   src = cinfo->src;
@@ -181,5 +187,5 @@
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
   src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (JOCTET *) inbuffer;
+  src->next_input_byte = (const JOCTET *) inbuffer;
 }
diff --git a/jdatasrc.c b/jdatasrc.c
index bf70422..c83183f 100644
--- a/jdatasrc.c
+++ b/jdatasrc.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2013, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains decompression data source routines for the case of
  * reading JPEG data from memory or from a file (or any stdio stream).
@@ -28,12 +29,12 @@
 typedef struct {
   struct jpeg_source_mgr pub;   /* public fields */
 
-  FILE * infile;                /* source stream */
-  JOCTET * buffer;              /* start of buffer */
+  FILE *infile;                 /* source stream */
+  JOCTET *buffer;               /* start of buffer */
   boolean start_of_file;        /* have we gotten any data yet? */
 } my_source_mgr;
 
-typedef my_source_mgr * my_src_ptr;
+typedef my_source_mgr *my_src_ptr;
 
 #define INPUT_BUF_SIZE  4096    /* choose an efficiently fread'able size */
 
@@ -161,7 +162,7 @@
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
@@ -213,7 +214,7 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
+jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
 {
   my_src_ptr src;
 
@@ -221,8 +222,6 @@
    * of JPEG images can be read from the same file by calling jpeg_stdio_src
    * only before the first one.  (If we discarded the buffer at the end of
    * one image, we'd likely lose the start of the next one.)
-   * This makes it unsafe to use this manager and a different source
-   * manager serially with the same JPEG object.  Caveat programmer.
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
@@ -232,6 +231,14 @@
     src->buffer = (JOCTET *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                   INPUT_BUF_SIZE * sizeof(JOCTET));
+  } else if (cinfo->src->init_source != init_source) {
+    /* It is unsafe to reuse the existing source manager unless it was created
+     * by this function.  Otherwise, there is no guarantee that the opaque
+     * structure is the right size.  Note that we could just create a new
+     * structure, but the old structure would not be freed until
+     * jpeg_destroy_decompress() was called.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   src = (my_src_ptr) cinfo->src;
@@ -254,9 +261,9 @@
 
 GLOBAL(void)
 jpeg_mem_src (j_decompress_ptr cinfo,
-              unsigned char * inbuffer, unsigned long insize)
+              const unsigned char *inbuffer, unsigned long insize)
 {
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
   if (inbuffer == NULL || insize == 0)  /* Treat empty input as fatal error */
     ERREXIT(cinfo, JERR_INPUT_EMPTY);
@@ -269,6 +276,11 @@
     cinfo->src = (struct jpeg_source_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                   sizeof(struct jpeg_source_mgr));
+  } else if (cinfo->src->init_source != init_mem_source) {
+    /* It is unsafe to reuse the existing source manager unless it was created
+     * by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   src = cinfo->src;
@@ -278,6 +290,6 @@
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
   src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (JOCTET *) inbuffer;
+  src->next_input_byte = (const JOCTET *) inbuffer;
 }
 #endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 199a628..1a48969 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -4,8 +4,11 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the coefficient buffer controller for decompression.
  * This controller is the top level of the JPEG decompressor proper.
@@ -16,53 +19,10 @@
  * Also, the input side (only) is used when reading a file for transcoding.
  */
 
-#define JPEG_INTERNALS
 #include "jinclude.h"
-#include "jpeglib.h"
+#include "jdcoefct.h"
 #include "jpegcomp.h"
 
-/* Block smoothing is only applicable for progressive JPEG, so: */
-#ifndef D_PROGRESSIVE_SUPPORTED
-#undef BLOCK_SMOOTHING_SUPPORTED
-#endif
-
-/* Private buffer controller object */
-
-typedef struct {
-  struct jpeg_d_coef_controller pub; /* public fields */
-
-  /* These variables keep track of the current location of the input side. */
-  /* cinfo->input_iMCU_row is also used for this. */
-  JDIMENSION MCU_ctr;           /* counts MCUs processed in current row */
-  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
-  int MCU_rows_per_iMCU_row;    /* number of such rows needed */
-
-  /* The output side's location is represented by cinfo->output_iMCU_row. */
-
-  /* In single-pass modes, it's sufficient to buffer just one MCU.
-   * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
-   * and let the entropy decoder write into that workspace each time.
-   * In multi-pass modes, this array points to the current MCU's blocks
-   * within the virtual arrays; it is used only by the input side.
-   */
-  JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
-
-  /* Temporary workspace for one MCU */
-  JCOEF * workspace;
-
-#ifdef D_MULTISCAN_FILES_SUPPORTED
-  /* In multi-pass modes, we need a virtual block array for each component. */
-  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
-#endif
-
-#ifdef BLOCK_SMOOTHING_SUPPORTED
-  /* When doing block smoothing, we latch coefficient Al values here */
-  int * coef_bits_latch;
-#define SAVED_COEFS  6          /* we save coef_bits[0..5] */
-#endif
-} my_coef_controller;
-
-typedef my_coef_controller * my_coef_ptr;
 
 /* Forward declarations */
 METHODDEF(int) decompress_onepass
@@ -78,30 +38,6 @@
 #endif
 
 
-LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
-/* Reset within-iMCU-row counters for a new row (input side) */
-{
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-
-  /* In an interleaved scan, an MCU row is the same as an iMCU row.
-   * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
-   * But at the bottom of the image, process only what's left.
-   */
-  if (cinfo->comps_in_scan > 1) {
-    coef->MCU_rows_per_iMCU_row = 1;
-  } else {
-    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
-      coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
-    else
-      coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
-  }
-
-  coef->MCU_ctr = 0;
-  coef->MCU_vert_offset = 0;
-}
-
-
 /*
  * Initialize for an input processing pass.
  */
@@ -173,38 +109,46 @@
         coef->MCU_ctr = MCU_col_num;
         return JPEG_SUSPENDED;
       }
-      /* Determine where data should go in output_buf and do the IDCT thing.
-       * We skip dummy blocks at the right and bottom edges (but blkn gets
-       * incremented past them!).  Note the inner loop relies on having
-       * allocated the MCU_buffer[] blocks sequentially.
+
+      /* Only perform the IDCT on blocks that are contained within the desired
+       * cropping region.
        */
-      blkn = 0;                 /* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-        compptr = cinfo->cur_comp_info[ci];
-        /* Don't bother to IDCT an uninteresting component. */
-        if (! compptr->component_needed) {
-          blkn += compptr->MCU_blocks;
-          continue;
-        }
-        inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-        useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                    : compptr->last_col_width;
-        output_ptr = output_buf[compptr->component_index] +
-          yoffset * compptr->_DCT_scaled_size;
-        start_col = MCU_col_num * compptr->MCU_sample_width;
-        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-          if (cinfo->input_iMCU_row < last_iMCU_row ||
-              yoffset+yindex < compptr->last_row_height) {
-            output_col = start_col;
-            for (xindex = 0; xindex < useful_width; xindex++) {
-              (*inverse_DCT) (cinfo, compptr,
-                              (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-                              output_ptr, output_col);
-              output_col += compptr->_DCT_scaled_size;
-            }
+      if (MCU_col_num >= cinfo->master->first_iMCU_col &&
+          MCU_col_num <= cinfo->master->last_iMCU_col) {
+        /* Determine where data should go in output_buf and do the IDCT thing.
+         * We skip dummy blocks at the right and bottom edges (but blkn gets
+         * incremented past them!).  Note the inner loop relies on having
+         * allocated the MCU_buffer[] blocks sequentially.
+         */
+        blkn = 0;                 /* index of current DCT block within MCU */
+        for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+          compptr = cinfo->cur_comp_info[ci];
+          /* Don't bother to IDCT an uninteresting component. */
+          if (! compptr->component_needed) {
+            blkn += compptr->MCU_blocks;
+            continue;
           }
-          blkn += compptr->MCU_width;
-          output_ptr += compptr->_DCT_scaled_size;
+          inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+          useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+                                                      : compptr->last_col_width;
+          output_ptr = output_buf[compptr->component_index] +
+            yoffset * compptr->_DCT_scaled_size;
+          start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
+              compptr->MCU_sample_width;
+          for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+            if (cinfo->input_iMCU_row < last_iMCU_row ||
+                yoffset+yindex < compptr->last_row_height) {
+              output_col = start_col;
+              for (xindex = 0; xindex < useful_width; xindex++) {
+                (*inverse_DCT) (cinfo, compptr,
+                                (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+                                output_ptr, output_col);
+                output_col += compptr->_DCT_scaled_size;
+              }
+            }
+            blkn += compptr->MCU_width;
+            output_ptr += compptr->_DCT_scaled_size;
+          }
         }
       }
     }
@@ -359,9 +303,10 @@
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
-      buffer_ptr = buffer[block_row];
+      buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
       output_col = 0;
-      for (block_num = 0; block_num < compptr->width_in_blocks; block_num++) {
+      for (block_num = cinfo->master->first_MCU_col[ci];
+           block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
         (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
                         output_ptr, output_col);
         buffer_ptr++;
@@ -411,9 +356,9 @@
   boolean smoothing_useful = FALSE;
   int ci, coefi;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtable;
-  int * coef_bits;
-  int * coef_bits_latch;
+  JQUANT_TBL *qtable;
+  int *coef_bits;
+  int *coef_bits_latch;
 
   if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -474,10 +419,10 @@
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  JCOEF * workspace;
+  JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
+  JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
@@ -547,7 +492,7 @@
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
-      buffer_ptr = buffer[block_row];
+      buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
       if (first_row && block_row == 0)
         prev_block_row = buffer_ptr;
       else
@@ -564,7 +509,8 @@
       DC7 = DC8 = DC9 = (int) next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
-      for (block_num = 0; block_num <= last_block_column; block_num++) {
+      for (block_num = cinfo->master->first_MCU_col[ci];
+           block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
         /* Fetch current DCT block into workspace so we can modify it. */
         jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
         /* Update DC values */
diff --git a/jdcoefct.h b/jdcoefct.h
new file mode 100644
index 0000000..bf6beb2
--- /dev/null
+++ b/jdcoefct.h
@@ -0,0 +1,82 @@
+/*
+ * jdcoefct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+
+/* Block smoothing is only applicable for progressive JPEG, so: */
+#ifndef D_PROGRESSIVE_SUPPORTED
+#undef BLOCK_SMOOTHING_SUPPORTED
+#endif
+
+
+/* Private buffer controller object */
+
+typedef struct {
+  struct jpeg_d_coef_controller pub; /* public fields */
+
+  /* These variables keep track of the current location of the input side. */
+  /* cinfo->input_iMCU_row is also used for this. */
+  JDIMENSION MCU_ctr;           /* counts MCUs processed in current row */
+  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
+  int MCU_rows_per_iMCU_row;    /* rows needed; set by start_iMCU_row() */
+
+  /* The output side's location is represented by cinfo->output_iMCU_row. */
+
+  /* In single-pass modes, it's sufficient to buffer just one MCU.
+   * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
+   * and let the entropy decoder write into that workspace each time.
+   * In multi-pass modes, this array points to the current MCU's blocks
+   * within the virtual arrays; it is used only by the input side.
+   */
+  JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+
+  /* Temporary workspace for one MCU */
+  JCOEF *workspace;
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+  /* In multi-pass modes, we need a virtual block array for each component. */
+  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+#endif
+
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+  /* When doing block smoothing, we latch coefficient Al values here */
+  int *coef_bits_latch;
+#define SAVED_COEFS  6          /* we save coef_bits[0..5] */
+#endif
+} my_coef_controller;
+
+typedef my_coef_controller *my_coef_ptr;  /* type that cinfo->coef is cast to */
+
+
+LOCAL(void)
+start_iMCU_row (j_decompress_ptr cinfo)
+/* Begin a new iMCU row on the input side by resetting its MCU counters */
+{
+  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  int rows_needed = 1;          /* interleaved scan: MCU row == iMCU row */
+
+  if (cinfo->comps_in_scan <= 1) {
+    /* Noninterleaved scan: an iMCU row spans v_samp_factor MCU rows, except
+     * in the bottom iMCU row of the image, where only last_row_height remain.
+     */
+    jpeg_component_info *compptr = cinfo->cur_comp_info[0];
+
+    rows_needed = (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1)) ?
+                  compptr->v_samp_factor : compptr->last_row_height;
+  }
+
+  /* Store the row count, then rewind both within-row counters. */
+  coef->MCU_rows_per_iMCU_row = rows_needed;
+  coef->MCU_vert_offset = 0;
+  coef->MCU_ctr = 0;
+}
diff --git a/jdcol565.c b/jdcol565.c
index 695f262..349fce4 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -30,12 +31,12 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
     inptr0 = input_buf[0][input_row];
     inptr1 = input_buf[1][input_row];
@@ -52,7 +53,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -87,7 +88,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -109,13 +110,13 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -133,7 +134,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -173,7 +174,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -192,7 +193,7 @@
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -205,7 +206,7 @@
       g = GETJSAMPLE(*inptr1++);
       b = GETJSAMPLE(*inptr2++);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -228,7 +229,7 @@
       g = GETJSAMPLE(*inptr1);
       b = GETJSAMPLE(*inptr2);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -245,11 +246,11 @@
   register JDIMENSION col;
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -262,7 +263,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -287,7 +288,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -304,7 +305,7 @@
   JDIMENSION num_cols = cinfo->output_width;
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int g;
 
     inptr = input_buf[0][input_row++];
@@ -312,7 +313,7 @@
     if (PACK_NEED_ALIGNMENT(outptr)) {
       g = *inptr++;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -327,7 +328,7 @@
     if (num_cols & 1) {
       g = *inptr;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -343,10 +344,10 @@
   register JDIMENSION col;
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int g;
 
     inptr = input_buf[0][input_row++];
@@ -355,7 +356,7 @@
       g = *inptr++;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -377,7 +378,7 @@
       g = *inptr;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
diff --git a/jdcolext.c b/jdcolext.c
index f72cab0..59b676c 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2011, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -41,8 +42,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
diff --git a/jdcolor.c b/jdcolor.c
index 38db90f..ab8fa24 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -8,7 +8,8 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009, 2011-2012, 2014-2015, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -26,16 +27,16 @@
   struct jpeg_color_deconverter pub; /* public fields */
 
   /* Private state for YCC->RGB conversion */
-  int * Cr_r_tab;               /* => table for Cr to R conversion */
-  int * Cb_b_tab;               /* => table for Cb to B conversion */
-  INT32 * Cr_g_tab;             /* => table for Cr to G conversion */
-  INT32 * Cb_g_tab;             /* => table for Cb to G conversion */
+  int *Cr_r_tab;                /* => table for Cr to R conversion */
+  int *Cb_b_tab;                /* => table for Cb to B conversion */
+  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
+  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
 
   /* Private state for RGB->Y conversion */
-  INT32 * rgb_y_tab;            /* => table for RGB to Y conversion */
+  JLONG *rgb_y_tab;             /* => table for RGB to Y conversion */
 } my_color_deconverter;
 
-typedef my_color_deconverter * my_cconvert_ptr;
+typedef my_color_deconverter *my_cconvert_ptr;
 
 
 /**************** YCbCr -> RGB conversion: most common case **************/
@@ -73,8 +74,8 @@
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)          ((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 /* We allocate one big table for RGB->Y conversion and divide it up into
  * three parts, instead of doing three alloc_small requests.  This lets us
@@ -211,7 +212,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   int i;
-  INT32 x;
+  JLONG x;
   SHIFT_TEMPS
 
   cconvert->Cr_r_tab = (int *)
@@ -220,12 +221,12 @@
   cconvert->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                 (MAXJSAMPLE+1) * sizeof(int));
-  cconvert->Cr_g_tab = (INT32 *)
+  cconvert->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(INT32));
-  cconvert->Cb_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
+  cconvert->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(INT32));
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -302,13 +303,13 @@
 build_rgb_y_table (j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  INT32 * rgb_y_tab;
-  INT32 i;
+  JLONG *rgb_y_tab;
+  JLONG i;
 
   /* Allocate and fill in the conversion tables. */
-  cconvert->rgb_y_tab = rgb_y_tab = (INT32 *)
+  cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (TABLE_SIZE * sizeof(INT32)));
+                                (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
     rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
@@ -329,7 +330,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_y_tab;
+  register JLONG *ctab = cconvert->rgb_y_tab;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
@@ -542,11 +543,11 @@
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -603,8 +604,8 @@
  */
 
 #define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
+#define DITHER_ROTATE(x)  ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const JLONG dither_matrix[4] = {
   0x0008020A,
   0x0C040E06,
   0x030B0109,
diff --git a/jdct.h b/jdct.h
index 6f8b159..faf8e1c 100644
--- a/jdct.h
+++ b/jdct.h
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This include file contains common declarations for the forward and
  * inverse DCT modules.  These declarations are private to the DCT managers
@@ -18,7 +19,7 @@
 /*
  * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
  * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples.  (NOTE: Floating-point DCT
+ * for 8-bit samples, JLONG for 12-bit samples.  (NOTE: Floating-point DCT
  * implementations use an array of type FAST_FLOAT, instead.)
  * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
  * The DCT outputs are returned scaled up by a factor of 8; they therefore
@@ -40,7 +41,7 @@
 typedef unsigned int UDCTELEM2;
 #endif
 #else
-typedef INT32 DCTELEM;          /* must have 32 bits */
+typedef JLONG DCTELEM;          /* must have 32 bits */
 typedef unsigned long long UDCTELEM2;
 #endif
 
@@ -67,7 +68,7 @@
 typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
 #define IFAST_SCALE_BITS  2     /* fractional bits in scale factors */
 #else
-typedef INT32 IFAST_MULT_TYPE;  /* need 32 bits for scaled quantizers */
+typedef JLONG IFAST_MULT_TYPE;  /* need 32 bits for scaled quantizers */
 #define IFAST_SCALE_BITS  13    /* fractional bits in scale factors */
 #endif
 typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
@@ -89,63 +90,63 @@
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow (DCTELEM * data);
-EXTERN(void) jpeg_fdct_ifast (DCTELEM * data);
-EXTERN(void) jpeg_fdct_float (FAST_FLOAT * data);
+EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
+EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
 
 EXTERN(void) jpeg_idct_islow
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_ifast
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_float
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_7x7
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_6x6
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_5x5
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_4x4
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_3x3
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_2x2
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_1x1
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_9x9
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_10x10
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_11x11
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_12x12
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_13x13
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_14x14
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_15x15
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_16x16
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
          JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
@@ -153,13 +154,13 @@
  * Macros for handling fixed-point arithmetic; these are used by many
  * but not all of the DCT/IDCT modules.
  *
- * All values are expected to be of type INT32.
+ * All values are expected to be of type JLONG.
  * Fractional constants are scaled left by CONST_BITS bits.
  * CONST_BITS is defined within each module using these macros,
  * and may differ from one module to the next.
  */
 
-#define ONE     ((INT32) 1)
+#define ONE     ((JLONG) 1)
 #define CONST_SCALE (ONE << CONST_BITS)
 
 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
@@ -167,16 +168,16 @@
  * thus causing a lot of useless floating-point operations at run time.
  */
 
-#define FIX(x)  ((INT32) ((x) * CONST_SCALE + 0.5))
+#define FIX(x)  ((JLONG) ((x) * CONST_SCALE + 0.5))
 
-/* Descale and correctly round an INT32 value that's scaled by N bits.
+/* Descale and correctly round a JLONG value that's scaled by N bits.
  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  * the fudge factor is correct for either sign of X.
  */
 
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * This macro is used only when the two inputs will actually be no more than
  * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a
  * full 32x32 multiply.  This provides a useful speedup on many machines.
@@ -189,7 +190,7 @@
 #define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
 #endif
 #ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT32) (const)))
+#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((JLONG) (const)))
 #endif
 
 #ifndef MULTIPLY16C16           /* default definition */
diff --git a/jddctmgr.c b/jddctmgr.c
index 6cc3310..bdf7c53 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -6,9 +6,10 @@
  * Modified 2002-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2015, D. R. Commander.
  * Copyright (C) 2013, MIPS Technologies, Inc., California
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the inverse-DCT management logic.
  * This code selects a particular IDCT implementation to be used,
@@ -58,7 +59,7 @@
   int cur_method[MAX_COMPONENTS];
 } my_idct_controller;
 
-typedef my_idct_controller * my_idct_ptr;
+typedef my_idct_controller *my_idct_ptr;
 
 
 /* Allocated multiplier tables: big enough for any supported variant */
@@ -100,7 +101,7 @@
   jpeg_component_info *compptr;
   int method = 0;
   inverse_DCT_method_ptr method_ptr = NULL;
-  JQUANT_TBL * qtbl;
+  JQUANT_TBL *qtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -245,7 +246,7 @@
         /* For LL&M IDCT method, multipliers are equal to raw quantization
          * coefficients, but are stored as ints to ensure access efficiency.
          */
-        ISLOW_MULT_TYPE * ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
         for (i = 0; i < DCTSIZE2; i++) {
           ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
         }
@@ -262,7 +263,7 @@
          * For integer operation, the multiplier table is to be scaled by
          * IFAST_SCALE_BITS.
          */
-        IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
 #define CONST_BITS 14
         static const INT16 aanscales[DCTSIZE2] = {
           /* precomputed values scaled up by 14 bits */
@@ -279,8 +280,8 @@
 
         for (i = 0; i < DCTSIZE2; i++) {
           ifmtbl[i] = (IFAST_MULT_TYPE)
-            DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-                                  (INT32) aanscales[i]),
+            DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                  (JLONG) aanscales[i]),
                     CONST_BITS-IFAST_SCALE_BITS);
         }
       }
@@ -294,7 +295,7 @@
          *   scalefactor[0] = 1
          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
          */
-        FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
         int row, col;
         static const double aanscalefactor[DCTSIZE] = {
           1.0, 1.387039845, 1.306562965, 1.175875602,
diff --git a/jdhuff.c b/jdhuff.c
index 2ab44a4..338f2a4 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy decoding routines.
  *
@@ -66,20 +67,20 @@
   unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
-  d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
 
   /* Precalculated info set up by start_pass for use in decode_mcu: */
 
   /* Pointers to derived tables to be used for each block within an MCU */
-  d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
-  d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+  d_derived_tbl *dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+  d_derived_tbl *ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   /* Whether we care about the DC and AC coefficient values for each block */
   boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
   boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
 } huff_entropy_decoder;
 
-typedef huff_entropy_decoder * huff_entropy_ptr;
+typedef huff_entropy_decoder *huff_entropy_ptr;
 
 
 /*
@@ -92,7 +93,7 @@
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, blkn, dctbl, actbl;
   d_derived_tbl **pdtbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
    * This ought to be an error condition, but we make it a warning because
@@ -152,7 +153,7 @@
 
 GLOBAL(void)
 jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
-                         d_derived_tbl ** pdtbl)
+                         d_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   d_derived_tbl *dtbl;
@@ -209,7 +210,7 @@
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((INT32) code) >= (((INT32) 1) << si))
+    if (((JLONG) code) >= (((JLONG) 1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -223,7 +224,7 @@
       /* valoffset[l] = huffval[] index of 1st symbol of code length l,
        * minus the minimum code of length l
        */
-      dtbl->valoffset[l] = (INT32) p - (INT32) huffcode[p];
+      dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
       p += htbl->bits[l];
       dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
     } else {
@@ -295,13 +296,13 @@
 
 
 GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state * state,
+jpeg_fill_bit_buffer (bitread_working_state *state,
                       register bit_buf_type get_buffer, register int bits_left,
                       int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
   /* Copy heavily used state fields into locals (hopefully registers) */
-  register const JOCTET * next_input_byte = state->next_input_byte;
+  register const JOCTET *next_input_byte = state->next_input_byte;
   register size_t bytes_in_buffer = state->bytes_in_buffer;
   j_decompress_ptr cinfo = state->cinfo;
 
@@ -445,12 +446,12 @@
  */
 
 GLOBAL(int)
-jpeg_huff_decode (bitread_working_state * state,
+jpeg_huff_decode (bitread_working_state *state,
                   register bit_buf_type get_buffer, register int bits_left,
-                  d_derived_tbl * htbl, int min_bits)
+                  d_derived_tbl *htbl, int min_bits)
 {
   register int l = min_bits;
-  register INT32 code;
+  register JLONG code;
 
   /* HUFF_DECODE has determined that the code is at least min_bits */
   /* bits long, so fetch that many bits in one swoop. */
@@ -564,9 +565,9 @@
   ASSIGN_STATE(state, entropy->saved);
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    JBLOCKROW block = MCU_data[blkn];
-    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
+    d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
     register int s, k, r;
 
     /* Decode a single block's worth of coefficients */
@@ -584,11 +585,13 @@
       int ci = cinfo->MCU_membership[blkn];
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
-      /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
-      (*block)[0] = (JCOEF) s;
+      if (block) {
+        /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+        (*block)[0] = (JCOEF) s;
+      }
     }
 
-    if (entropy->ac_needed[blkn]) {
+    if (entropy->ac_needed[blkn] && block) {
 
       /* Section F.2.2.2: decode the AC coefficients */
       /* Since zeroes are skipped, output area must be cleared beforehand */
@@ -661,9 +664,9 @@
   ASSIGN_STATE(state, entropy->saved);
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    JBLOCKROW block = MCU_data[blkn];
-    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
+    d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
     register int s, k, r, l;
 
     HUFF_DECODE_FAST(s, l, dctbl);
@@ -677,10 +680,11 @@
       int ci = cinfo->MCU_membership[blkn];
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
-      (*block)[0] = (JCOEF) s;
+      if (block)
+        (*block)[0] = (JCOEF) s;
     }
 
-    if (entropy->ac_needed[blkn]) {
+    if (entropy->ac_needed[blkn] && block) {
 
       for (k = 1; k < DCTSIZE2; k++) {
         HUFF_DECODE_FAST(s, l, actbl);
diff --git a/jdhuff.h b/jdhuff.h
index db71067..87d4465 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for Huffman entropy decoding routines
  * that are shared between the sequential decoder (jdhuff.c) and the
@@ -21,9 +22,9 @@
 
 typedef struct {
   /* Basic tables: (element [0] of each array is unused) */
-  INT32 maxcode[18];            /* largest code of length k (-1 if none) */
+  JLONG maxcode[18];            /* largest code of length k (-1 if none) */
   /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
-  INT32 valoffset[18];          /* huffval[] offset for codes of length k */
+  JLONG valoffset[18];          /* huffval[] offset for codes of length k */
   /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
    * the smallest code of length k; so given a code of length k, the
    * corresponding symbol is huffval[code + valoffset[k]]
@@ -75,12 +76,12 @@
 
 #if SIZEOF_SIZE_T==8 || defined(_WIN64)
 
-typedef size_t bit_buf_type;    /* type of bit-extraction buffer */
+typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
 #else
 
-typedef INT32 bit_buf_type;     /* type of bit-extraction buffer */
+typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  32                /* size of buffer in bits */
 
 #endif
@@ -100,7 +101,7 @@
 typedef struct {                /* Bitreading working state within an MCU */
   /* Current data source location */
   /* We need a copy, rather than munging the original, in case of suspension */
-  const JOCTET * next_input_byte; /* => next byte to read from source */
+  const JOCTET *next_input_byte; /* => next byte to read from source */
   size_t bytes_in_buffer;       /* # of bytes remaining in source buffer */
   /* Bit input buffer --- note these values are kept in register variables,
    * not in this struct, inside the inner loops.
@@ -165,7 +166,7 @@
 
 /* Load up the bit buffer to a depth of at least nbits */
 EXTERN(boolean) jpeg_fill_bit_buffer
-        (bitread_working_state * state, register bit_buf_type get_buffer,
+        (bitread_working_state *state, register bit_buf_type get_buffer,
          register int bits_left, int nbits);
 
 
@@ -229,5 +230,5 @@
 
 /* Out-of-line case for Huffman code fetching */
 EXTERN(int) jpeg_huff_decode
-        (bitread_working_state * state, register bit_buf_type get_buffer,
-         register int bits_left, d_derived_tbl * htbl, int min_bits);
+        (bitread_working_state *state, register bit_buf_type get_buffer,
+         register int bits_left, d_derived_tbl *htbl, int min_bits);
diff --git a/jdinput.c b/jdinput.c
index 6f4ea7b..32a6b42 100644
--- a/jdinput.c
+++ b/jdinput.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input control logic for the JPEG decompressor.
  * These routines are concerned with controlling the decompressor's input
@@ -27,7 +29,7 @@
   boolean inheaders;            /* TRUE until first SOS is reached */
 } my_input_controller;
 
-typedef my_input_controller * my_inputctl_ptr;
+typedef my_input_controller *my_inputctl_ptr;
 
 
 /* Forward declarations */
@@ -104,6 +106,11 @@
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
                     (long) (cinfo->max_v_samp_factor * DCTSIZE));
+    /* Set the first and last MCU columns to decompress from multi-scan images.
+     * By default, decompress all of the MCU columns.
+     */
+    cinfo->master->first_MCU_col[ci] = 0;
+    cinfo->master->last_MCU_col[ci] = compptr->width_in_blocks - 1;
     /* downsampled_width and downsampled_height will also be overridden by
      * jdmaster.c if we are doing full decompression.  The transcoder library
      * doesn't use these values, but the calling application might.
@@ -238,7 +245,7 @@
 {
   int ci, qtblno;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtbl;
+  JQUANT_TBL *qtbl;
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
diff --git a/jdmainct.c b/jdmainct.c
index 7f7bd33..ebb069b 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the main buffer controller for decompression.
  * The main buffer lies between the JPEG decompressor proper and the
@@ -15,10 +16,8 @@
  * supplies the equivalent of the main buffer in that case.
  */
 
-#define JPEG_INTERNALS
 #include "jinclude.h"
-#include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jdmainct.h"
 
 
 /*
@@ -112,36 +111,6 @@
  */
 
 
-/* Private buffer controller object */
-
-typedef struct {
-  struct jpeg_d_main_controller pub; /* public fields */
-
-  /* Pointer to allocated workspace (M or M+2 row groups). */
-  JSAMPARRAY buffer[MAX_COMPONENTS];
-
-  boolean buffer_full;          /* Have we gotten an iMCU row from decoder? */
-  JDIMENSION rowgroup_ctr;      /* counts row groups output to postprocessor */
-
-  /* Remaining fields are only used in the context case. */
-
-  /* These are the master pointers to the funny-order pointer lists. */
-  JSAMPIMAGE xbuffer[2];        /* pointers to weird pointer lists */
-
-  int whichptr;                 /* indicates which pointer set is now in use */
-  int context_state;            /* process_data state machine status */
-  JDIMENSION rowgroups_avail;   /* row groups available to postprocessor */
-  JDIMENSION iMCU_row_ctr;      /* counts iMCU rows to detect image top/bot */
-} my_main_controller;
-
-typedef my_main_controller * my_main_ptr;
-
-/* context_state values: */
-#define CTX_PREPARE_FOR_IMCU    0       /* need to prepare for MCU row */
-#define CTX_PROCESS_IMCU        1       /* feeding iMCU to postprocessor */
-#define CTX_POSTPONED_ROW       2       /* feeding postponed row group */
-
-
 /* Forward declarations */
 METHODDEF(void) process_data_simple_main
         (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
@@ -238,34 +207,6 @@
 
 
 LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
-/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
- * This changes the pointer list state from top-of-image to the normal state.
- */
-{
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
-  int ci, i, rgroup;
-  int M = cinfo->_min_DCT_scaled_size;
-  jpeg_component_info *compptr;
-  JSAMPARRAY xbuf0, xbuf1;
-
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
-      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = main_ptr->xbuffer[0][ci];
-    xbuf1 = main_ptr->xbuffer[1][ci];
-    for (i = 0; i < rgroup; i++) {
-      xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
-      xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
-      xbuf0[rgroup*(M+2) + i] = xbuf0[i];
-      xbuf1[rgroup*(M+2) + i] = xbuf1[i];
-    }
-  }
-}
-
-
-LOCAL(void)
 set_bottom_pointers (j_decompress_ptr cinfo)
 /* Change the pointer lists to duplicate the last sample row at the bottom
  * of the image.  whichptr indicates which xbuffer holds the final iMCU row.
diff --git a/jdmainct.h b/jdmainct.h
new file mode 100644
index 0000000..3090301
--- /dev/null
+++ b/jdmainct.h
@@ -0,0 +1,71 @@
+/*
+ * jdmainct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Private buffer controller object */
+
+typedef struct {
+  struct jpeg_d_main_controller pub; /* public fields; first member */
+
+  /* Per-component pointers to allocated workspace (M or M+2 row groups). */
+  JSAMPARRAY buffer[MAX_COMPONENTS];
+
+  boolean buffer_full;          /* Have we gotten an iMCU row from decoder? */
+  JDIMENSION rowgroup_ctr;      /* counts row groups output to postprocessor */
+
+  /* Remaining fields are only used in the context case. */
+
+  /* These are the master pointers to the funny-order pointer lists. */
+  JSAMPIMAGE xbuffer[2];        /* used as xbuffer[0..1][component index] */
+
+  int whichptr;                 /* indicates which pointer set is now in use */
+  int context_state;            /* process_data state machine: CTX_xxx value */
+  JDIMENSION rowgroups_avail;   /* row groups available to postprocessor */
+  JDIMENSION iMCU_row_ctr;      /* counts iMCU rows to detect image top/bot */
+} my_main_controller;
+
+typedef my_main_controller *my_main_ptr;
+
+
+/* context_state values: */
+#define CTX_PREPARE_FOR_IMCU    0       /* need to prepare for MCU row */
+#define CTX_PROCESS_IMCU        1       /* feeding iMCU to postprocessor */
+#define CTX_POSTPONED_ROW       2       /* feeding postponed row group */
+
+
+LOCAL(void)
+set_wraparound_pointers (j_decompress_ptr cinfo)
+/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
+ * This changes the pointer list state from top-of-image to the normal state.
+ * (Below, "group g" means list entries g*rgroup .. g*rgroup + rgroup-1.) */
+{
+  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  int ci, i, rgroup;
+  int M = cinfo->_min_DCT_scaled_size;
+  jpeg_component_info *compptr;
+  JSAMPARRAY xbuf0, xbuf1;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
+    xbuf0 = main_ptr->xbuffer[0][ci];
+    xbuf1 = main_ptr->xbuffer[1][ci];
+    for (i = 0; i < rgroup; i++) {
+      xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i]; /* group -1 <- group M+1 */
+      xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i]; /* same for 2nd list */
+      xbuf0[rgroup*(M+2) + i] = xbuf0[i];          /* group M+2 <- group 0 */
+      xbuf1[rgroup*(M+2) + i] = xbuf1[i];          /* same for 2nd list */
+    }
+  }
+}
diff --git a/jdmarker.c b/jdmarker.c
index d1357af..e3b612c 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2012, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2012, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to decode JPEG datastream markers.
  * Most of the complexity arises from our desire to support input
@@ -106,7 +107,7 @@
   /* Note: cur_marker is not linked into marker_list until it's all read. */
 } my_marker_reader;
 
-typedef my_marker_reader * my_marker_ptr;
+typedef my_marker_reader *my_marker_ptr;
 
 
 /*
@@ -119,8 +120,8 @@
 
 /* Declare and initialize local copies of input pointer/count */
 #define INPUT_VARS(cinfo)  \
-        struct jpeg_source_mgr * datasrc = (cinfo)->src;  \
-        const JOCTET * next_input_byte = datasrc->next_input_byte;  \
+        struct jpeg_source_mgr *datasrc = (cinfo)->src;  \
+        const JOCTET *next_input_byte = datasrc->next_input_byte;  \
         size_t bytes_in_buffer = datasrc->bytes_in_buffer
 
 /* Unload the local copies --- do this only at a restart boundary */
@@ -153,7 +154,7 @@
                   V = GETJOCTET(*next_input_byte++); )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
- * V should be declared unsigned int or perhaps INT32.
+ * V should be declared unsigned int or perhaps JLONG.
  */
 #define INPUT_2BYTES(cinfo,V,action)  \
         MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
@@ -239,9 +240,9 @@
 get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
 /* Process a SOFn marker */
 {
-  INT32 length;
+  JLONG length;
   int c, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
   cinfo->progressive_mode = is_prog;
@@ -303,9 +304,9 @@
 get_sos (j_decompress_ptr cinfo)
 /* Process a SOS marker */
 {
-  INT32 length;
+  JLONG length;
   int i, ci, n, c, cc, pi;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
   if (! cinfo->marker->saw_SOF)
@@ -386,7 +387,7 @@
 get_dac (j_decompress_ptr cinfo)
 /* Process a DAC marker */
 {
-  INT32 length;
+  JLONG length;
   int index, val;
   INPUT_VARS(cinfo);
 
@@ -432,7 +433,7 @@
 get_dht (j_decompress_ptr cinfo)
 /* Process a DHT marker */
 {
-  INT32 length;
+  JLONG length;
   UINT8 bits[17];
   UINT8 huffval[256];
   int i, index, count;
@@ -466,7 +467,7 @@
     /* Here we just do minimal validation of the counts to avoid walking
      * off the end of our table space.  jdhuff.c will check more carefully.
      */
-    if (count > 256 || ((INT32) count) > length)
+    if (count > 256 || ((JLONG) count) > length)
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
     for (i = 0; i < count; i++)
@@ -506,7 +507,7 @@
 get_dqt (j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
-  INT32 length;
+  JLONG length;
   int n, i, prec;
   unsigned int tmp;
   JQUANT_TBL *quant_ptr;
@@ -564,7 +565,7 @@
 get_dri (j_decompress_ptr cinfo)
 /* Process a DRI marker */
 {
-  INT32 length;
+  JLONG length;
   unsigned int tmp;
   INPUT_VARS(cinfo);
 
@@ -597,14 +598,14 @@
 
 
 LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET * data,
-              unsigned int datalen, INT32 remaining)
+examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
+              unsigned int datalen, JLONG remaining)
 /* Examine first few bytes from an APP0.
  * Take appropriate action if it is a JFIF marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
  */
 {
-  INT32 totallen = (INT32) datalen + remaining;
+  JLONG totallen = (JLONG) datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
       GETJOCTET(data[0]) == 0x4A &&
@@ -638,7 +639,7 @@
                GETJOCTET(data[12]), GETJOCTET(data[13]));
     totallen -= APP0_DATA_LEN;
     if (totallen !=
-        ((INT32)GETJOCTET(data[12]) * (INT32)GETJOCTET(data[13]) * (INT32) 3))
+        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
   } else if (datalen >= 6 &&
       GETJOCTET(data[0]) == 0x4A &&
@@ -673,8 +674,8 @@
 
 
 LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET * data,
-               unsigned int datalen, INT32 remaining)
+examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
+               unsigned int datalen, JLONG remaining)
 /* Examine first few bytes from an APP14.
  * Take appropriate action if it is an Adobe marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -707,7 +708,7 @@
 get_interesting_appn (j_decompress_ptr cinfo)
 /* Process an APP0 or APP14 marker without saving it */
 {
-  INT32 length;
+  JLONG length;
   JOCTET b[APPN_DATA_LEN];
   unsigned int i, numtoread;
   INPUT_VARS(cinfo);
@@ -758,8 +759,8 @@
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
   jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
   unsigned int bytes_read, data_length;
-  JOCTET * data;
-  INT32 length = 0;
+  JOCTET *data;
+  JLONG length = 0;
   INPUT_VARS(cinfo);
 
   if (cur_marker == NULL) {
@@ -861,7 +862,7 @@
 skip_variable (j_decompress_ptr cinfo)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
-  INT32 length;
+  JLONG length;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
diff --git a/jdmaster.c b/jdmaster.c
index 604e291..7908849 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -5,9 +5,11 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains master control logic for the JPEG decompressor.
  * These routines are concerned with selecting the modules to be executed
@@ -19,25 +21,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jpegcomp.h"
-
-
-/* Private state */
-
-typedef struct {
-  struct jpeg_decomp_master pub; /* public fields */
-
-  int pass_number;              /* # of passes completed */
-
-  boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */
-
-  /* Saved references to initialized quantizer modules,
-   * in case we need to switch modes.
-   */
-  struct jpeg_color_quantizer * quantizer_1pass;
-  struct jpeg_color_quantizer * quantizer_2pass;
-} my_decomp_master;
-
-typedef my_decomp_master * my_master_ptr;
+#include "jdmaster.h"
 
 
 /*
@@ -424,7 +408,7 @@
 prepare_range_limit_table (j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
-  JSAMPLE * table;
+  JSAMPLE *table;
   int i;
 
   table = (JSAMPLE *)
@@ -578,6 +562,12 @@
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
 
+  /* Set the first and last iMCU columns to decompress from single-scan images.
+   * By default, decompress all of the iMCU columns.
+   */
+  cinfo->master->first_iMCU_col = 0;
+  cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
    * progress monitoring appropriately.  The input step is counted
@@ -722,16 +712,13 @@
 GLOBAL(void)
 jinit_master_decompress (j_decompress_ptr cinfo)
 {
-  my_master_ptr master;
+  my_master_ptr master = (my_master_ptr) cinfo->master; /* not allocated here */
 
-  master = (my_master_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(my_decomp_master));
-  cinfo->master = (struct jpeg_decomp_master *) master;
   master->pub.prepare_for_output_pass = prepare_for_output_pass;
   master->pub.finish_output_pass = finish_output_pass;
 
   master->pub.is_dummy_pass = FALSE;
+  master->pub.jinit_upsampler_no_alloc = FALSE;
 
   master_selection(cinfo);
 }
diff --git a/jdmaster.h b/jdmaster.h
new file mode 100644
index 0000000..76897e2
--- /dev/null
+++ b/jdmaster.h
@@ -0,0 +1,28 @@
+/*
+ * jdmaster.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1995, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the master control structure for the JPEG decompressor.
+ */
+
+/* Private state */
+
+typedef struct {
+  struct jpeg_decomp_master pub; /* public fields; first member */
+
+  int pass_number;              /* number of passes completed so far */
+
+  boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */
+
+  /* Saved references to initialized quantizer modules,
+   * in case we need to switch modes.
+   */
+  struct jpeg_color_quantizer *quantizer_1pass;
+  struct jpeg_color_quantizer *quantizer_2pass;
+} my_decomp_master;
+
+typedef my_decomp_master *my_master_ptr;
diff --git a/jdmerge.c b/jdmerge.c
index e13adb9..6276dd0 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -3,11 +3,12 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014 D. R. Commander.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  *
@@ -55,10 +56,10 @@
                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
   /* Private state for YCC->RGB conversion */
-  int * Cr_r_tab;               /* => table for Cr to R conversion */
-  int * Cb_b_tab;               /* => table for Cb to B conversion */
-  INT32 * Cr_g_tab;             /* => table for Cr to G conversion */
-  INT32 * Cb_g_tab;             /* => table for Cb to G conversion */
+  int *Cr_r_tab;                /* => table for Cr to R conversion */
+  int *Cb_b_tab;                /* => table for Cb to B conversion */
+  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
+  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
 
   /* For 2:1 vertical sampling, we produce two output rows at a time.
    * We need a "spare" row buffer to hold the second output row if the
@@ -72,11 +73,11 @@
   JDIMENSION rows_to_go;        /* counts rows remaining in image */
 } my_upsampler;
 
-typedef my_upsampler * my_upsample_ptr;
+typedef my_upsampler *my_upsample_ptr;
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)          ((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 
 /* Include inline routines for colorspace extensions */
@@ -190,7 +191,7 @@
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   int i;
-  INT32 x;
+  JLONG x;
   SHIFT_TEMPS
 
   upsample->Cr_r_tab = (int *)
@@ -199,12 +200,12 @@
   upsample->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                 (MAXJSAMPLE+1) * sizeof(int));
-  upsample->Cr_g_tab = (INT32 *)
+  upsample->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(INT32));
-  upsample->Cb_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
+  upsample->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(INT32));
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -435,12 +436,12 @@
 #define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
 
 #define WRITE_TWO_PIXELS_LE(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (pixels);  \
-  ((INT16*)(addr))[1] = (pixels) >> 16;  \
+  ((INT16*)(addr))[0] = (INT16)(pixels);  \
+  ((INT16*)(addr))[1] = (INT16)((pixels) >> 16);  \
 }
 #define WRITE_TWO_PIXELS_BE(addr, pixels) {  \
-  ((INT16*)(addr))[1] = (pixels);  \
-  ((INT16*)(addr))[0] = (pixels) >> 16;  \
+  ((INT16*)(addr))[1] = (INT16)(pixels);  \
+  ((INT16*)(addr))[0] = (INT16)((pixels) >> 16);  \
 }
 
 #define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
@@ -455,8 +456,8 @@
  */
 
 #define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
+#define DITHER_ROTATE(x)  ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const JLONG dither_matrix[4] = {
   0x0008020A,
   0x0C040E06,
   0x030B0109,
diff --git a/jdmrg565.c b/jdmrg565.c
index 0a10bcc..18287b3 100644
--- a/jdmrg565.c
+++ b/jdmrg565.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  */
@@ -29,10 +30,10 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -78,7 +79,7 @@
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
+    *(INT16*)outptr = (INT16)rgb;
    }
  }
 
@@ -100,11 +101,11 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -152,7 +153,7 @@
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
+    *(INT16*)outptr = (INT16)rgb;
   }
 }
 
@@ -174,10 +175,10 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr * 2];
@@ -241,14 +242,14 @@
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
+    *(INT16*)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
+    *(INT16*)outptr1 = (INT16)rgb;
   }
 }
 
@@ -270,12 +271,12 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  INT32 d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr*2];
@@ -343,13 +344,13 @@
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
+    *(INT16*)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
+    *(INT16*)outptr1 = (INT16)rgb;
   }
 }
diff --git a/jdmrgext.c b/jdmrgext.c
index 1f0a550..9d7d2af 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  */
@@ -35,8 +36,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -108,8 +109,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr*2];
diff --git a/jdphuff.c b/jdphuff.c
index 7ef8e7b..42a7068 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy decoding routines for progressive JPEG.
  *
@@ -68,12 +69,12 @@
   unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
 
-  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+  d_derived_tbl *ac_derived_tbl; /* active table during an AC scan */
 } phuff_entropy_decoder;
 
-typedef phuff_entropy_decoder * phuff_entropy_ptr;
+typedef phuff_entropy_decoder *phuff_entropy_ptr;
 
 /* Forward declarations */
 METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
@@ -98,7 +99,7 @@
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
   int *coef_bit_ptr;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
 
@@ -297,8 +298,8 @@
   JBLOCKROW block;
   BITREAD_STATE_VARS;
   savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
+  d_derived_tbl *tbl;
+  jpeg_component_info *compptr;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -368,7 +369,7 @@
   unsigned int EOBRUN;
   JBLOCKROW block;
   BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
+  d_derived_tbl *tbl;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -504,7 +505,7 @@
   JBLOCKROW block;
   JCOEFPTR thiscoef;
   BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
+  d_derived_tbl *tbl;
   int num_newnz;
   int newnz_pos[DCTSIZE2];
 
diff --git a/jdpostct.c b/jdpostct.c
index 9eef9ee..601fc2a 100644
--- a/jdpostct.c
+++ b/jdpostct.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the decompression postprocessing controller.
  * This controller manages the upsampling, color conversion, and color
@@ -41,7 +42,7 @@
   JDIMENSION next_row;          /* index of next row to fill/empty in strip */
 } my_post_controller;
 
-typedef my_post_controller * my_post_ptr;
+typedef my_post_controller *my_post_ptr;
 
 
 /* Forward declarations */
diff --git a/jdsample.c b/jdsample.c
index 2752966..39b3725 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -5,9 +5,11 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains upsampling routines.
  *
@@ -22,51 +24,12 @@
  *   Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7.
  */
 
-#define JPEG_INTERNALS
 #include "jinclude.h"
-#include "jpeglib.h"
+#include "jdsample.h"
 #include "jsimd.h"
 #include "jpegcomp.h"
 
 
-/* Pointer to routine to upsample a single component */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
-                               JSAMPARRAY input_data,
-                               JSAMPARRAY * output_data_ptr);
-
-/* Private subobject */
-
-typedef struct {
-  struct jpeg_upsampler pub;    /* public fields */
-
-  /* Color conversion buffer.  When using separate upsampling and color
-   * conversion steps, this buffer holds one upsampled row group until it
-   * has been color converted and output.
-   * Note: we do not allocate any storage for component(s) which are full-size,
-   * ie do not need rescaling.  The corresponding entry of color_buf[] is
-   * simply set to point to the input data array, thereby avoiding copying.
-   */
-  JSAMPARRAY color_buf[MAX_COMPONENTS];
-
-  /* Per-component upsampling method pointers */
-  upsample1_ptr methods[MAX_COMPONENTS];
-
-  int next_row_out;             /* counts rows emitted from color_buf */
-  JDIMENSION rows_to_go;        /* counts rows remaining in image */
-
-  /* Height of an input row group for each component. */
-  int rowgroup_height[MAX_COMPONENTS];
-
-  /* These arrays save pixel expansion factors so that int_expand need not
-   * recompute them each time.  They are unused for other upsampling methods.
-   */
-  UINT8 h_expand[MAX_COMPONENTS];
-  UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler * my_upsample_ptr;
-
 
 /*
  * Initialize for an upsampling pass.
@@ -101,7 +64,7 @@
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JDIMENSION num_rows;
 
   /* Fill the conversion buffer, if it's empty */
@@ -161,8 +124,8 @@
  */
 
 METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                   JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
@@ -174,8 +137,8 @@
  */
 
 METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-               JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = NULL;      /* safety check */
 }
@@ -193,8 +156,8 @@
  */
 
 METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-              JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   JSAMPARRAY output_data = *output_data_ptr;
@@ -237,8 +200,8 @@
  */
 
 METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-               JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -265,8 +228,8 @@
  */
 
 METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-               JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -308,8 +271,8 @@
  */
 
 METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -349,15 +312,15 @@
  */
 
 METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
   register int thiscolsum, lastcolsum, nextcolsum;
 #else
-  register INT32 thiscolsum, lastcolsum, nextcolsum;
+  register JLONG thiscolsum, lastcolsum, nextcolsum;
 #endif
   register JDIMENSION colctr;
   int inrow, outrow, v;
@@ -407,17 +370,20 @@
 {
   my_upsample_ptr upsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   boolean need_buffer, do_fancy;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
-  upsample = (my_upsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                sizeof(my_upsampler));
-  cinfo->upsample = (struct jpeg_upsampler *) upsample;
-  upsample->pub.start_pass = start_pass_upsample;
-  upsample->pub.upsample = sep_upsample;
-  upsample->pub.need_context_rows = FALSE; /* until we find out differently */
+  if (!cinfo->master->jinit_upsampler_no_alloc) {
+    upsample = (my_upsample_ptr)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                  sizeof(my_upsampler));
+    cinfo->upsample = (struct jpeg_upsampler *) upsample;
+    upsample->pub.start_pass = start_pass_upsample;
+    upsample->pub.upsample = sep_upsample;
+    upsample->pub.need_context_rows = FALSE; /* until we find out differently */
+  } else
+    upsample = (my_upsample_ptr) cinfo->upsample;
 
   if (cinfo->CCIR601_sampling)  /* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
@@ -493,7 +459,7 @@
       upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
-    if (need_buffer) {
+    if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
       upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
         ((j_common_ptr) cinfo, JPOOL_IMAGE,
          (JDIMENSION) jround_up((long) cinfo->output_width,
diff --git a/jdsample.h b/jdsample.h
new file mode 100644
index 0000000..a6bf08a
--- /dev/null
+++ b/jdsample.h
@@ -0,0 +1,50 @@
+/*
+ * jdsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+
+/* Pointer to routine to upsample a single component */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+
+/* Private subobject */
+
+typedef struct {
+  struct jpeg_upsampler pub;    /* public fields; must be first (ptr casts) */
+
+  /* Color conversion buffer.  When using separate upsampling and color
+   * conversion steps, this buffer holds one upsampled row group until it
+   * has been color converted and output.
+   * Note: we do not allocate any storage for component(s) which are full-size,
+   * i.e. do not need rescaling.  The corresponding entry of color_buf[] is
+   * simply set to point to the input data array, thereby avoiding copying.
+   */
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+
+  /* Per-component upsampling method pointers */
+  upsample1_ptr methods[MAX_COMPONENTS];
+
+  int next_row_out;             /* counts rows emitted from color_buf */
+  JDIMENSION rows_to_go;        /* counts rows remaining in image */
+
+  /* Height of an input row group for each component. */
+  int rowgroup_height[MAX_COMPONENTS];
+
+  /* These arrays save pixel expansion factors so that int_upsample need not
+   * recompute them each time.  They are unused for other upsampling methods.
+   */
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
diff --git a/jdtrans.c b/jdtrans.c
index 86fda3b..cfc85dd 100644
--- a/jdtrans.c
+++ b/jdtrans.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains library routines for transcoding decompression,
  * that is, reading raw DCT coefficient arrays from an input JPEG file.
diff --git a/jerror.c b/jerror.c
index cd3098d..c31acd9 100644
--- a/jerror.c
+++ b/jerror.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains simple error-reporting and trace-message routines.
  * These are suitable for Unix-like systems and others where writing to
@@ -125,7 +126,7 @@
 METHODDEF(void)
 emit_message (j_common_ptr cinfo, int msg_level)
 {
-  struct jpeg_error_mgr * err = cinfo->err;
+  struct jpeg_error_mgr *err = cinfo->err;
 
   if (msg_level < 0) {
     /* It's a warning message.  Since corrupt files may generate many warnings,
@@ -152,12 +153,12 @@
  */
 
 METHODDEF(void)
-format_message (j_common_ptr cinfo, char * buffer)
+format_message (j_common_ptr cinfo, char *buffer)
 {
-  struct jpeg_error_mgr * err = cinfo->err;
+  struct jpeg_error_mgr *err = cinfo->err;
   int msg_code = err->msg_code;
-  const char * msgtext = NULL;
-  const char * msgptr;
+  const char *msgtext = NULL;
+  const char *msgptr;
   char ch;
   boolean isstring;
 
@@ -226,7 +227,7 @@
  */
 
 GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr * err)
+jpeg_std_error (struct jpeg_error_mgr *err)
 {
   err->error_exit = error_exit;
   err->emit_message = emit_message;
diff --git a/jerror.h b/jerror.h
index 402613e..11a07cb 100644
--- a/jerror.h
+++ b/jerror.h
@@ -6,7 +6,8 @@
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the error and message codes for the JPEG library.
  * Edit this file to add new codes, or to translate the message strings to
diff --git a/jfdctflt.c b/jfdctflt.c
index a8367c6..b3da3eb 100644
--- a/jfdctflt.c
+++ b/jfdctflt.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a floating-point implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -20,8 +21,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -56,7 +57,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT *data)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
diff --git a/jfdctfst.c b/jfdctfst.c
index 4936d47..82b2515 100644
--- a/jfdctfst.c
+++ b/jfdctfst.c
@@ -1,9 +1,12 @@
 /*
  * jfdctfst.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a fast, not so accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -15,8 +18,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -76,10 +79,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_0_382683433  ((INT32)   98)         /* FIX(0.382683433) */
-#define FIX_0_541196100  ((INT32)  139)         /* FIX(0.541196100) */
-#define FIX_0_707106781  ((INT32)  181)         /* FIX(0.707106781) */
-#define FIX_1_306562965  ((INT32)  334)         /* FIX(1.306562965) */
+#define FIX_0_382683433  ((JLONG)   98)         /* FIX(0.382683433) */
+#define FIX_0_541196100  ((JLONG)  139)         /* FIX(0.541196100) */
+#define FIX_0_707106781  ((JLONG)  181)         /* FIX(0.707106781) */
+#define FIX_1_306562965  ((JLONG)  334)         /* FIX(1.306562965) */
 #else
 #define FIX_0_382683433  FIX(0.382683433)
 #define FIX_0_541196100  FIX(0.541196100)
@@ -99,7 +102,7 @@
 #endif
 
 
-/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
  * descale to yield a DCTELEM result.
  */
 
@@ -111,7 +114,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM *data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
diff --git a/jfdctint.c b/jfdctint.c
index 68b0835..73e0b59 100644
--- a/jfdctint.c
+++ b/jfdctint.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a slow-but-accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -69,7 +70,7 @@
  * they are represented to better-than-integral precision.  These outputs
  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  * with the recommended scaling.  (For 12-bit sample data, the intermediate
- * array is INT32 anyway.)
+ * array is JLONG anyway.)
  *
  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
@@ -92,18 +93,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((INT32)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((INT32)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((INT32)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((INT32)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((INT32)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((INT32)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((INT32)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((INT32)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((INT32)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((INT32)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((INT32)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((INT32)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -120,7 +121,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -139,11 +140,11 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM * data)
+jpeg_fdct_islow (DCTELEM *data)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  JLONG tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3, z4, z5;
   DCTELEM *dataptr;
   int ctr;
   SHIFT_TEMPS
diff --git a/jidctflt.c b/jidctflt.c
index 324a2cb..68c521e 100644
--- a/jidctflt.c
+++ b/jidctflt.c
@@ -6,7 +6,8 @@
  * Modified 2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a floating-point implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -25,8 +26,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -68,7 +69,7 @@
  */
 
 GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
@@ -76,8 +77,8 @@
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
   FAST_FLOAT z5, z10, z11, z12, z13;
   JCOEFPTR inptr;
-  FLOAT_MULT_TYPE * quantptr;
-  FAST_FLOAT * wsptr;
+  FLOAT_MULT_TYPE *quantptr;
+  FAST_FLOAT *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
diff --git a/jidctfst.c b/jidctfst.c
index cae22b9..10db739 100644
--- a/jidctfst.c
+++ b/jidctfst.c
@@ -1,9 +1,12 @@
 /*
  * jidctfst.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a fast, not so accurate integer implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -17,8 +20,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -89,10 +92,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_1_082392200  ((INT32)  277)         /* FIX(1.082392200) */
-#define FIX_1_414213562  ((INT32)  362)         /* FIX(1.414213562) */
-#define FIX_1_847759065  ((INT32)  473)         /* FIX(1.847759065) */
-#define FIX_2_613125930  ((INT32)  669)         /* FIX(2.613125930) */
+#define FIX_1_082392200  ((JLONG)  277)         /* FIX(1.082392200) */
+#define FIX_1_414213562  ((JLONG)  362)         /* FIX(1.414213562) */
+#define FIX_1_847759065  ((JLONG)  473)         /* FIX(1.847759065) */
+#define FIX_2_613125930  ((JLONG)  669)         /* FIX(2.613125930) */
 #else
 #define FIX_1_082392200  FIX(1.082392200)
 #define FIX_1_414213562  FIX(1.414213562)
@@ -112,7 +115,7 @@
 #endif
 
 
-/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
  * descale to yield a DCTELEM result.
  */
 
@@ -122,7 +125,7 @@
 /* Dequantize a coefficient by multiplying it by the multiplier-table
  * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
  * multiplication will do.  For 12-bit data, the multiplier table is
- * declared INT32, so a 32-bit multiply will be used.
+ * declared JLONG, so a 32-bit multiply will be used.
  */
 
 #if BITS_IN_JSAMPLE == 8
@@ -134,7 +137,7 @@
 
 
 /* Like DESCALE, but applies to a DCTELEM and produces an int.
- * We assume that int right shift is unsigned if INT32 right shift is.
+ * We assume that int right shift is unsigned if JLONG right shift is.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
@@ -165,7 +168,7 @@
  */
 
 GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
@@ -173,8 +176,8 @@
   DCTELEM tmp10, tmp11, tmp12, tmp13;
   DCTELEM z5, z10, z11, z12, z13;
   JCOEFPTR inptr;
-  IFAST_MULT_TYPE * quantptr;
-  int * wsptr;
+  IFAST_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
diff --git a/jidctint.c b/jidctint.c
index 3429795..a2d03fc 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -6,7 +6,8 @@
  * Modification developed 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a slow-but-accurate integer implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -91,7 +92,7 @@
  * they are represented to better-than-integral precision.  These outputs
  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  * with the recommended scaling.  (To scale up 12-bit sample data further, an
- * intermediate INT32 array would be needed.)
+ * intermediate JLONG array would be needed.)
  *
  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
@@ -114,18 +115,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((INT32)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((INT32)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((INT32)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((INT32)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((INT32)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((INT32)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((INT32)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((INT32)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((INT32)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((INT32)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((INT32)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((INT32)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -142,7 +143,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -169,16 +170,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3;
-  INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp0, tmp1, tmp2, tmp3;
+  JLONG tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3, z4, z5;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -313,7 +314,7 @@
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
                                   & RANGE_MASK];
 
       outptr[0] = dcval;
@@ -333,15 +334,15 @@
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    tmp0 = LEFT_SHIFT((INT32) wsptr[0] + (INT32) wsptr[4], CONST_BITS);
-    tmp1 = LEFT_SHIFT((INT32) wsptr[0] - (INT32) wsptr[4], CONST_BITS);
+    tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
+    tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
 
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
@@ -352,10 +353,10 @@
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = (INT32) wsptr[7];
-    tmp1 = (INT32) wsptr[5];
-    tmp2 = (INT32) wsptr[3];
-    tmp3 = (INT32) wsptr[1];
+    tmp0 = (JLONG) wsptr[7];
+    tmp1 = (JLONG) wsptr[5];
+    tmp2 = (JLONG) wsptr[3];
+    tmp3 = (JLONG) wsptr[1];
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
@@ -423,15 +424,15 @@
  */
 
 GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -501,12 +502,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -520,9 +521,9 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
@@ -572,15 +573,15 @@
  */
 
 GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -637,22 +638,22 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (INT32) wsptr[4];
+    tmp2 = (JLONG) wsptr[4];
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
     tmp11 = tmp0 - tmp10 - tmp10;
-    tmp10 = (INT32) wsptr[2];
+    tmp10 = (JLONG) wsptr[2];
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -693,15 +694,15 @@
  */
 
 GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -756,10 +757,10 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
-    tmp0 = (INT32) wsptr[2];
-    tmp1 = (INT32) wsptr[4];
+    tmp0 = (JLONG) wsptr[2];
+    tmp1 = (JLONG) wsptr[4];
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
@@ -769,8 +770,8 @@
 
     /* Odd part */
 
-    z2 = (INT32) wsptr[1];
-    z3 = (INT32) wsptr[3];
+    z2 = (JLONG) wsptr[1];
+    z3 = (JLONG) wsptr[3];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -808,14 +809,14 @@
  */
 
 GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp2, tmp10, tmp12;
+  JLONG tmp0, tmp2, tmp10, tmp12;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -860,16 +861,16 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (INT32) wsptr[2];
+    tmp2 = (JLONG) wsptr[2];
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = (INT32) wsptr[1];
+    tmp12 = (JLONG) wsptr[1];
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
@@ -898,15 +899,15 @@
  */
 
 GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -985,12 +986,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -1010,10 +1011,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
 
@@ -1069,16 +1070,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
+  JLONG z1, z2, z3, z4, z5;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -1168,9 +1169,9 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (INT32) wsptr[4];
+    z4 = (JLONG) wsptr[4];
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
@@ -1178,8 +1179,8 @@
 
     tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1192,11 +1193,11 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (INT32) wsptr[7];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1264,16 +1265,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -1361,12 +1362,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
@@ -1386,10 +1387,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1458,16 +1459,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -1565,19 +1566,19 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
 
-    z4 = (INT32) wsptr[4];
+    z4 = (JLONG) wsptr[4];
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = (INT32) wsptr[2];
+    z1 = (JLONG) wsptr[2];
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z2 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[6];
     z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
@@ -1597,10 +1598,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
@@ -1674,16 +1675,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -1786,12 +1787,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[4];
-    z4 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[4];
+    z4 = (JLONG) wsptr[6];
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1818,10 +1819,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
@@ -1902,16 +1903,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -2013,9 +2014,9 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z4 = (INT32) wsptr[4];
+    z4 = (JLONG) wsptr[4];
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -2026,8 +2027,8 @@
 
     tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[6];
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
@@ -2045,10 +2046,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
     z4 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
@@ -2128,16 +2129,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -2245,12 +2246,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[4];
-    z4 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[4];
+    z4 = (JLONG) wsptr[6];
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2285,11 +2286,11 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z4 = (JLONG) wsptr[5];
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = (INT32) wsptr[7];
+    z4 = (JLONG) wsptr[7];
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
@@ -2370,16 +2371,16 @@
  */
 
 GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block,
                  JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -2496,10 +2497,10 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (INT32) wsptr[4];
+    z1 = (JLONG) wsptr[4];
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2508,8 +2509,8 @@
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[6];
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2530,10 +2531,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z1 + z3;
 
diff --git a/jidctred.c b/jidctred.c
index 4598698..2d5b546 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains inverse-DCT routines that produce reduced-size output:
  * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
@@ -57,20 +58,20 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_211164243  ((INT32)  1730)        /* FIX(0.211164243) */
-#define FIX_0_509795579  ((INT32)  4176)        /* FIX(0.509795579) */
-#define FIX_0_601344887  ((INT32)  4926)        /* FIX(0.601344887) */
-#define FIX_0_720959822  ((INT32)  5906)        /* FIX(0.720959822) */
-#define FIX_0_765366865  ((INT32)  6270)        /* FIX(0.765366865) */
-#define FIX_0_850430095  ((INT32)  6967)        /* FIX(0.850430095) */
-#define FIX_0_899976223  ((INT32)  7373)        /* FIX(0.899976223) */
-#define FIX_1_061594337  ((INT32)  8697)        /* FIX(1.061594337) */
-#define FIX_1_272758580  ((INT32)  10426)       /* FIX(1.272758580) */
-#define FIX_1_451774981  ((INT32)  11893)       /* FIX(1.451774981) */
-#define FIX_1_847759065  ((INT32)  15137)       /* FIX(1.847759065) */
-#define FIX_2_172734803  ((INT32)  17799)       /* FIX(2.172734803) */
-#define FIX_2_562915447  ((INT32)  20995)       /* FIX(2.562915447) */
-#define FIX_3_624509785  ((INT32)  29692)       /* FIX(3.624509785) */
+#define FIX_0_211164243  ((JLONG)  1730)        /* FIX(0.211164243) */
+#define FIX_0_509795579  ((JLONG)  4176)        /* FIX(0.509795579) */
+#define FIX_0_601344887  ((JLONG)  4926)        /* FIX(0.601344887) */
+#define FIX_0_720959822  ((JLONG)  5906)        /* FIX(0.720959822) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_850430095  ((JLONG)  6967)        /* FIX(0.850430095) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_061594337  ((JLONG)  8697)        /* FIX(1.061594337) */
+#define FIX_1_272758580  ((JLONG)  10426)       /* FIX(1.272758580) */
+#define FIX_1_451774981  ((JLONG)  11893)       /* FIX(1.451774981) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_2_172734803  ((JLONG)  17799)       /* FIX(2.172734803) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_624509785  ((JLONG)  29692)       /* FIX(3.624509785) */
 #else
 #define FIX_0_211164243  FIX(0.211164243)
 #define FIX_0_509795579  FIX(0.509795579)
@@ -89,7 +90,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -117,15 +118,15 @@
  */
 
 GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp2, tmp10, tmp12;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp2, tmp10, tmp12;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -205,7 +206,7 @@
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
                                   & RANGE_MASK];
 
       outptr[0] = dcval;
@@ -220,20 +221,20 @@
 
     /* Even part */
 
-    tmp0 = LEFT_SHIFT((INT32) wsptr[0], CONST_BITS+1);
+    tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
 
-    tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
-         + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865);
+    tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+         + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
 
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[7];
-    z2 = (INT32) wsptr[5];
-    z3 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[1];
+    z1 = (JLONG) wsptr[7];
+    z2 = (JLONG) wsptr[5];
+    z3 = (JLONG) wsptr[3];
+    z4 = (JLONG) wsptr[1];
 
     tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
          + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
@@ -271,14 +272,14 @@
  */
 
 GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp10, z1;
+  JLONG tmp0, tmp10, z1;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
@@ -338,7 +339,7 @@
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
                                   & RANGE_MASK];
 
       outptr[0] = dcval;
@@ -351,14 +352,14 @@
 
     /* Even part */
 
-    tmp10 = LEFT_SHIFT((INT32) wsptr[0], CONST_BITS+2);
+    tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
 
     /* Odd part */
 
-    tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-         + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-         + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-         + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
+         + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
+         + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
+         + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
 
     /* Final output stage */
 
@@ -380,12 +381,12 @@
  */
 
 GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block,
                JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   int dcval;
-  ISLOW_MULT_TYPE * quantptr;
+  ISLOW_MULT_TYPE *quantptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   SHIFT_TEMPS
 
@@ -394,7 +395,7 @@
    */
   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((INT32) dcval, 3);
+  dcval = (int) DESCALE((JLONG) dcval, 3);
 
   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
 }
diff --git a/jinclude.h b/jinclude.h
index 4dced6e..d461a1a 100644
--- a/jinclude.h
+++ b/jinclude.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1994, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file exists to provide a single place to fix any problems with
  * including the wrong system include files.  (Common problems are taken
diff --git a/jmemmgr.c b/jmemmgr.c
index 4b0fcac..9174ad3 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the JPEG system-independent memory management
  * routines.  This code is usable across a wide variety of machines; most
@@ -34,7 +35,7 @@
 
 #ifndef NO_GETENV
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
-extern char * getenv (const char * name);
+extern char *getenv (const char *name);
 #endif
 #endif
 
@@ -96,7 +97,7 @@
  * Small and large pool headers are identical.
  */
 
-typedef struct small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct *small_pool_ptr;
 
 typedef struct small_pool_struct {
   small_pool_ptr next;  /* next in list of pools */
@@ -104,7 +105,7 @@
   size_t bytes_left;            /* bytes still available in this pool */
 } small_pool_hdr;
 
-typedef struct large_pool_struct * large_pool_ptr;
+typedef struct large_pool_struct *large_pool_ptr;
 
 typedef struct large_pool_struct {
   large_pool_ptr next;  /* next in list of pools */
@@ -140,7 +141,7 @@
   JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */
 } my_memory_mgr;
 
-typedef my_memory_mgr * my_mem_ptr;
+typedef my_memory_mgr *my_mem_ptr;
 
 
 /*
@@ -266,7 +267,7 @@
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
-  char * data_ptr;
+  char *data_ptr;
   size_t min_request, slop;
 
   /*
@@ -362,7 +363,7 @@
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   large_pool_ptr hdr_ptr;
-  char * data_ptr;
+  char *data_ptr;
 
   /*
    * Round up the requested size to a multiple of ALIGN_SIZE so that
@@ -1153,7 +1154,7 @@
    * this feature.
    */
 #ifndef NO_GETENV
-  { char * memenv;
+  { char *memenv;
 
     if ((memenv = getenv("JPEGMEM")) != NULL) {
       char ch = 'x';
diff --git a/jmemnobs.c b/jmemnobs.c
index 6282832..5797198 100644
--- a/jmemnobs.c
+++ b/jmemnobs.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1992-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file provides a really simple implementation of the system-
  * dependent portion of the JPEG memory manager.  This implementation
@@ -23,7 +24,7 @@
 #include "jmemsys.h"            /* import the system-dependent declarations */
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void * malloc (size_t size);
+extern void *malloc (size_t size);
 extern void free (void *ptr);
 #endif
 
@@ -40,7 +41,7 @@
 }
 
 GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
+jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -57,7 +58,7 @@
 }
 
 GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void * object, size_t sizeofobject)
+jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
diff --git a/jmemsys.h b/jmemsys.h
index 5026c7c..f7dfe87 100644
--- a/jmemsys.h
+++ b/jmemsys.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1992-1997, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This include file defines the interface between the system-independent
  * and system-dependent portions of the JPEG memory manager.  No other
@@ -31,7 +32,7 @@
  */
 
 EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void * object,
+EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
                               size_t sizeofobject);
 
 /*
@@ -43,7 +44,7 @@
  */
 
 EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void * object,
+EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
                               size_t sizeofobject);
 
 /*
@@ -116,15 +117,15 @@
 #endif /* USE_MAC_MEMMGR */
 
 
-typedef struct backing_store_struct * backing_store_ptr;
+typedef struct backing_store_struct *backing_store_ptr;
 
 typedef struct backing_store_struct {
   /* Methods for reading/writing/closing this backing-store object */
   void (*read_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
-                              void * buffer_address, long file_offset,
+                              void *buffer_address, long file_offset,
                               long byte_count);
   void (*write_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
-                               void * buffer_address, long file_offset,
+                               void *buffer_address, long file_offset,
                                long byte_count);
   void (*close_backing_store) (j_common_ptr cinfo, backing_store_ptr info);
 
@@ -141,7 +142,7 @@
   char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
 #else
   /* For a typical implementation with temp files, we need: */
-  FILE * temp_file;             /* stdio reference to temp file */
+  FILE *temp_file;              /* stdio reference to temp file */
   char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */
 #endif
 #endif
diff --git a/jmorecfg.h b/jmorecfg.h
index be89189..1d96786 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -6,7 +6,8 @@
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains additional configuration options that customize the
  * JPEG software for special applications or support machine-dependent
@@ -146,13 +147,35 @@
 typedef short INT16;
 #endif
 
-/* INT32 must hold at least signed 32-bit values. */
+/* INT32 must hold at least signed 32-bit values.
+ *
+ * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.)  Integers were
+ * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to
+ * long.  It also wasn't common (or at least as common) in 1994 for INT32 to be
+ * defined by platform headers.  Since then, however, INT32 is defined in
+ * several other common places:
+ *
+ * Xmd.h (X11 header) typedefs INT32 to int on 64-bit platforms and long on
+ * 32-bit platforms (i.e., always a 32-bit signed type.)
+ *
+ * basetsd.h (Win32 header) typedefs INT32 to int (always a 32-bit signed type
+ * on modern platforms.)
+ *
+ * qglobal.h (Qt header) typedefs INT32 to int (always a 32-bit signed type on
+ * modern platforms.)
+ *
+ * This is a recipe for conflict, since "long" and "int" aren't always
+ * compatible types.  Since the definition of INT32 has technically been part
+ * of the libjpeg API for more than 20 years, we can't remove it, but we do not
+ * use it internally any longer.  We instead define a separate type (JLONG)
+ * for internal use, which ensures that internal behavior will always be the
+ * same regardless of any external headers that may be included.
+ */
 
 #ifndef XMD_H                   /* X11/xmd.h correctly defines INT32 */
 #ifndef _BASETSD_H_		/* Microsoft defines it in basetsd.h */
 #ifndef _BASETSD_H		/* MinGW is slightly different */
 #ifndef QGLOBAL_H		/* Qt defines it in qglobal.h */
-#define __INT32_IS_ACTUALLY_LONG
 typedef long INT32;
 #endif
 #endif
diff --git a/jpegcomp.h b/jpegcomp.h
index ed9eeab..c39275b 100644
--- a/jpegcomp.h
+++ b/jpegcomp.h
@@ -2,7 +2,8 @@
  * jpegcomp.h
  *
  * Copyright (C) 2010, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * JPEG compatibility macros
  * These declarations are considered internal to the JPEG library; most
diff --git a/jpegint.h b/jpegint.h
index 5f56b11..c3b4320 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -5,8 +5,10 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015-2016, D. R. Commander
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file provides common declarations for the various JPEG modules.
  * These declarations are considered internal to the JPEG library; most
@@ -42,16 +44,16 @@
 #define DSTATE_STOPPING 210     /* looking for EOI in jpeg_finish_decompress */
 
 
+/* JLONG must hold at least signed 32-bit values. */
+typedef long JLONG;
+
+
 /*
  * Left shift macro that handles a negative operand without causing any
  * sanitizer warnings
  */
 
-#ifdef __INT32_IS_ACTUALLY_LONG
-#define LEFT_SHIFT(a, b) ((INT32)((unsigned long)(a) << (b)))
-#else
-#define LEFT_SHIFT(a, b) ((INT32)((unsigned int)(a) << (b)))
-#endif
+#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
 
 
 /* Declarations for compression modules */
@@ -112,7 +114,7 @@
 struct jpeg_forward_dct {
   void (*start_pass) (j_compress_ptr cinfo);
   /* perhaps this should be an array??? */
-  void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info * compptr,
+  void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                        JDIMENSION start_row, JDIMENSION start_col,
                        JDIMENSION num_blocks);
@@ -149,6 +151,13 @@
 
   /* State variables made visible to other modules */
   boolean is_dummy_pass;        /* True during 1st pass for 2-pass quant */
+
+  /* Partial decompression variables */
+  JDIMENSION first_iMCU_col;
+  JDIMENSION last_iMCU_col;
+  JDIMENSION first_MCU_col[MAX_COMPS_IN_SCAN];
+  JDIMENSION last_MCU_col[MAX_COMPS_IN_SCAN];
+  boolean jinit_upsampler_no_alloc;
 };
 
 /* Input control module */
@@ -222,7 +231,7 @@
 
 /* Inverse DCT (also performs dequantization) */
 typedef void (*inverse_DCT_method_ptr) (j_decompress_ptr cinfo,
-                                        jpeg_component_info * compptr,
+                                        jpeg_component_info *compptr,
                                         JCOEFPTR coef_block,
                                         JSAMPARRAY output_buf,
                                         JDIMENSION output_col);
@@ -275,16 +284,16 @@
  * shift" instructions that shift in copies of the sign bit.  But some
  * C compilers implement >> with an unsigned shift.  For these machines you
  * must define RIGHT_SHIFT_IS_UNSIGNED.
- * RIGHT_SHIFT provides a proper signed right shift of an INT32 quantity.
+ * RIGHT_SHIFT provides a proper signed right shift of a JLONG quantity.
  * It is only applied with constant shift counts.  SHIFT_TEMPS must be
  * included in the variables of any routine using RIGHT_SHIFT.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define SHIFT_TEMPS     INT32 shift_temp;
+#define SHIFT_TEMPS     JLONG shift_temp;
 #define RIGHT_SHIFT(x,shft)  \
         ((shift_temp = (x)) < 0 ? \
-         (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
+         (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
          (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
@@ -339,7 +348,7 @@
                                 int num_rows, JDIMENSION num_cols);
 EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
                               JDIMENSION num_blocks);
-EXTERN(void) jzero_far (void * target, size_t bytestozero);
+EXTERN(void) jzero_far (void *target, size_t bytestozero);
 /* Constant tables in jutils.c */
 #if 0                           /* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
@@ -347,7 +356,7 @@
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
 /* Arithmetic coding probability estimation tables in jaricom.c */
-extern const INT32 jpeg_aritab[];
+extern const JLONG jpeg_aritab[];
 
 /* Suppress undefined-structure complaints if necessary. */
 
diff --git a/jpeglib.h b/jpeglib.h
index 9615c5d..6c63f58 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -5,8 +5,10 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the application interface for the JPEG library.
  * Most applications using the library need only include this file,
@@ -178,10 +180,10 @@
    * See jdinput.c comments about the need for this information.
    * This field is currently used only for decompression.
    */
-  JQUANT_TBL * quant_table;
+  JQUANT_TBL *quant_table;
 
   /* Private per-component storage for DCT or IDCT subsystem. */
-  void * dct_table;
+  void *dct_table;
 } jpeg_component_info;
 
 
@@ -196,14 +198,14 @@
 
 /* The decompressor can save APPn and COM markers in a list of these: */
 
-typedef struct jpeg_marker_struct * jpeg_saved_marker_ptr;
+typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr;
 
 struct jpeg_marker_struct {
   jpeg_saved_marker_ptr next;   /* next in list, or NULL */
   UINT8 marker;                 /* marker code: JPEG_COM, or JPEG_APP0+n */
   unsigned int original_length; /* # bytes of data in the file */
   unsigned int data_length;     /* # bytes of data saved at data[] */
-  JOCTET * data;                /* the data contained in the marker */
+  JOCTET *data;                 /* the data contained in the marker */
   /* the marker length word is not counted in data_length or original_length */
 };
 
@@ -266,10 +268,10 @@
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
-  struct jpeg_error_mgr * err;  /* Error handler module */\
-  struct jpeg_memory_mgr * mem; /* Memory manager module */\
-  struct jpeg_progress_mgr * progress; /* Progress monitor, or NULL if none */\
-  void * client_data;           /* Available for use by application */\
+  struct jpeg_error_mgr *err;   /* Error handler module */\
+  struct jpeg_memory_mgr *mem;  /* Memory manager module */\
+  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
+  void *client_data;            /* Available for use by application */\
   boolean is_decompressor;      /* So common code can tell which is which */\
   int global_state              /* For checking call sequence validity */
 
@@ -285,9 +287,9 @@
    */
 };
 
-typedef struct jpeg_common_struct * j_common_ptr;
-typedef struct jpeg_compress_struct * j_compress_ptr;
-typedef struct jpeg_decompress_struct * j_decompress_ptr;
+typedef struct jpeg_common_struct *j_common_ptr;
+typedef struct jpeg_compress_struct *j_compress_ptr;
+typedef struct jpeg_decompress_struct *j_decompress_ptr;
 
 
 /* Master record for a compression instance */
@@ -296,7 +298,7 @@
   jpeg_common_fields;           /* Fields shared with jpeg_decompress_struct */
 
   /* Destination for compressed data */
-  struct jpeg_destination_mgr * dest;
+  struct jpeg_destination_mgr *dest;
 
   /* Description of source image --- these fields must be filled in by
    * outer application before starting compression.  in_color_space must
@@ -336,10 +338,10 @@
   int num_components;           /* # of color components in JPEG image */
   J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
 
-  jpeg_component_info * comp_info;
+  jpeg_component_info *comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
-  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
 #if JPEG_LIB_VERSION >= 70
   int q_scale_factor[NUM_QUANT_TBLS];
 #endif
@@ -347,8 +349,8 @@
    * and corresponding scale factors (percentage, initialized 100).
    */
 
-  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
-  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
 
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
@@ -356,7 +358,7 @@
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
 
   int num_scans;                /* # of entries in scan_info array */
-  const jpeg_scan_info * scan_info; /* script for multi-scan file, or NULL */
+  const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */
   /* The default value of scan_info is NULL, which causes a single-scan
    * sequential JPEG file to be emitted.  To create a multi-scan file,
    * set num_scans and scan_info to point to an array of scan definitions.
@@ -429,7 +431,7 @@
    * They describe the components and MCUs actually appearing in the scan.
    */
   int comps_in_scan;            /* # of JPEG components in this scan */
-  jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
 
   JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
@@ -444,23 +446,23 @@
 
 #if JPEG_LIB_VERSION >= 80
   int block_size;               /* the basic DCT block size: 1..16 */
-  const int * natural_order;    /* natural-order position array */
+  const int *natural_order;     /* natural-order position array */
   int lim_Se;                   /* min( Se, DCTSIZE2-1 ) */
 #endif
 
   /*
    * Links to compression subobjects (methods and private variables of modules)
    */
-  struct jpeg_comp_master * master;
-  struct jpeg_c_main_controller * main;
-  struct jpeg_c_prep_controller * prep;
-  struct jpeg_c_coef_controller * coef;
-  struct jpeg_marker_writer * marker;
-  struct jpeg_color_converter * cconvert;
-  struct jpeg_downsampler * downsample;
-  struct jpeg_forward_dct * fdct;
-  struct jpeg_entropy_encoder * entropy;
-  jpeg_scan_info * script_space; /* workspace for jpeg_simple_progression */
+  struct jpeg_comp_master *master;
+  struct jpeg_c_main_controller *main;
+  struct jpeg_c_prep_controller *prep;
+  struct jpeg_c_coef_controller *coef;
+  struct jpeg_marker_writer *marker;
+  struct jpeg_color_converter *cconvert;
+  struct jpeg_downsampler *downsample;
+  struct jpeg_forward_dct *fdct;
+  struct jpeg_entropy_encoder *entropy;
+  jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */
   int script_space_size;
 };
 
@@ -471,7 +473,7 @@
   jpeg_common_fields;           /* Fields shared with jpeg_compress_struct */
 
   /* Source of compressed data */
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
   /* Basic description of image --- filled in by jpeg_read_header(). */
   /* Application may inspect these values to decide how to process image. */
@@ -578,11 +580,11 @@
    * datastreams when processing abbreviated JPEG datastreams.
    */
 
-  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
   /* ptrs to coefficient quantization tables, or NULL if not defined */
 
-  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
-  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
 
   /* These parameters are never carried across datastreams, since they
@@ -591,7 +593,7 @@
 
   int data_precision;           /* bits of precision in image data */
 
-  jpeg_component_info * comp_info;
+  jpeg_component_info *comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
 #if JPEG_LIB_VERSION >= 80
@@ -653,7 +655,7 @@
    * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
    */
 
-  JSAMPLE * sample_range_limit; /* table for fast range-limiting */
+  JSAMPLE *sample_range_limit;  /* table for fast range-limiting */
 
   /*
    * These fields are valid during any one scan.
@@ -661,7 +663,7 @@
    * Note that the decompressor output side must not use these fields.
    */
   int comps_in_scan;            /* # of JPEG components in this scan */
-  jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
 
   JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
@@ -678,7 +680,7 @@
   /* These fields are derived from Se of first SOS marker.
    */
   int block_size;               /* the basic DCT block size: 1..16 */
-  const int * natural_order; /* natural-order position array for entropy decode */
+  const int *natural_order; /* natural-order position array for entropy decode */
   int lim_Se;                   /* min( Se, DCTSIZE2-1 ) for entropy decode */
 #endif
 
@@ -691,17 +693,17 @@
   /*
    * Links to decompression subobjects (methods, private variables of modules)
    */
-  struct jpeg_decomp_master * master;
-  struct jpeg_d_main_controller * main;
-  struct jpeg_d_coef_controller * coef;
-  struct jpeg_d_post_controller * post;
-  struct jpeg_input_controller * inputctl;
-  struct jpeg_marker_reader * marker;
-  struct jpeg_entropy_decoder * entropy;
-  struct jpeg_inverse_dct * idct;
-  struct jpeg_upsampler * upsample;
-  struct jpeg_color_deconverter * cconvert;
-  struct jpeg_color_quantizer * cquantize;
+  struct jpeg_decomp_master *master;
+  struct jpeg_d_main_controller *main;
+  struct jpeg_d_coef_controller *coef;
+  struct jpeg_d_post_controller *post;
+  struct jpeg_input_controller *inputctl;
+  struct jpeg_marker_reader *marker;
+  struct jpeg_entropy_decoder *entropy;
+  struct jpeg_inverse_dct *idct;
+  struct jpeg_upsampler *upsample;
+  struct jpeg_color_deconverter *cconvert;
+  struct jpeg_color_quantizer *cquantize;
 };
 
 
@@ -723,7 +725,7 @@
   /* Routine that actually outputs a trace or error message */
   void (*output_message) (j_common_ptr cinfo);
   /* Format a message string for the most recent JPEG error or message */
-  void (*format_message) (j_common_ptr cinfo, char * buffer);
+  void (*format_message) (j_common_ptr cinfo, char *buffer);
 #define JMSG_LENGTH_MAX  200    /* recommended size of format_message buffer */
   /* Reset error state variables at start of a new image */
   void (*reset_error_mgr) (j_common_ptr cinfo);
@@ -760,12 +762,12 @@
    * First table includes all errors generated by JPEG library itself.
    * Error code 0 is reserved for a "no such error string" message.
    */
-  const char * const * jpeg_message_table; /* Library errors */
+  const char * const *jpeg_message_table; /* Library errors */
   int last_jpeg_message;    /* Table contains strings 0..last_jpeg_message */
   /* Second table can be added by application (see cjpeg/djpeg for example).
    * It contains strings numbered first_addon_message..last_addon_message.
    */
-  const char * const * addon_message_table; /* Non-library errors */
+  const char * const *addon_message_table; /* Non-library errors */
   int first_addon_message;      /* code for first string in addon table */
   int last_addon_message;       /* code for last string in addon table */
 };
@@ -786,7 +788,7 @@
 /* Data destination object for compression */
 
 struct jpeg_destination_mgr {
-  JOCTET * next_output_byte;    /* => next byte to write in buffer */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
 
   void (*init_destination) (j_compress_ptr cinfo);
@@ -798,7 +800,7 @@
 /* Data source object for decompression */
 
 struct jpeg_source_mgr {
-  const JOCTET * next_input_byte; /* => next byte to read from buffer */
+  const JOCTET *next_input_byte; /* => next byte to read from buffer */
   size_t bytes_in_buffer;       /* # of bytes remaining in buffer */
 
   void (*init_source) (j_decompress_ptr cinfo);
@@ -824,15 +826,15 @@
 #define JPOOL_IMAGE     1       /* lasts until done with image/datastream */
 #define JPOOL_NUMPOOLS  2
 
-typedef struct jvirt_sarray_control * jvirt_sarray_ptr;
-typedef struct jvirt_barray_control * jvirt_barray_ptr;
+typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
+typedef struct jvirt_barray_control *jvirt_barray_ptr;
 
 
 struct jpeg_memory_mgr {
   /* Method pointers */
-  void * (*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
-  void * (*alloc_large) (j_common_ptr cinfo, int pool_id,
-                         size_t sizeofobject);
+  void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
+  void *(*alloc_large) (j_common_ptr cinfo, int pool_id,
+                        size_t sizeofobject);
   JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id,
                               JDIMENSION samplesperrow, JDIMENSION numrows);
   JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
@@ -886,7 +888,7 @@
 
 
 /* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr * err);
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
 
 /* Initialization of JPEG compression objects.
  * jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -911,14 +913,15 @@
 
 /* Standard data source and destination managers: stdio streams. */
 /* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile);
-EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile);
+EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char ** outbuffer,
-                            unsigned long * outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, unsigned char * inbuffer,
+EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
+                            unsigned long *outsize);
+EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
+                           const unsigned char *inbuffer,
                            unsigned long insize);
 #endif
 
@@ -964,7 +967,7 @@
 
 /* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
-                                const JOCTET * dataptr, unsigned int datalen);
+                                const JOCTET *dataptr, unsigned int datalen);
 /* Same, but piecemeal. */
 EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
                                   unsigned int datalen);
@@ -990,6 +993,10 @@
 EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
                                         JSAMPARRAY scanlines,
                                         JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
+                                        JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                 JDIMENSION *width);
 EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
 
 /* Replaces jpeg_read_scanlines when reading raw downsampled data. */
@@ -1028,7 +1035,7 @@
 /* Read or write raw DCT coefficients --- useful for lossless transcoding. */
 EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
 EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
-                                      jvirt_barray_ptr * coef_arrays);
+                                      jvirt_barray_ptr *coef_arrays);
 EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
                                             j_compress_ptr dstinfo);
 
diff --git a/jpegtran.1 b/jpegtran.1
index 5b8e126..7f3c853 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "21 November 2014"
+.TH JPEGTRAN 1 "18 February 2016"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -19,6 +19,10 @@
 perform some rearrangements of the image data, for example turning an image
 from landscape to portrait format by rotation.
 .PP
+For Exif files and JPEG files containing Exif data, you may prefer to use
+.B exiftran
+instead.
+.PP
 .B jpegtran
 works by rearranging the compressed data (DCT coefficients), without
 ever fully decoding the image.  Therefore, its transformations are lossless:
@@ -28,7 +32,11 @@
 .B cjpeg
 to accomplish the same conversion.  But by the same token,
 .B jpegtran
-cannot perform lossy operations such as changing the image quality.
+cannot perform lossy operations such as changing the image quality.  However,
+while the image data is losslessly transformed, metadata can be removed.  See
+the
+.B \-copy
+option for specifics.
 .PP
 .B jpegtran
 reads the named JPEG/JFIF file, or the standard input if no file is
@@ -150,6 +158,18 @@
 .B | pnmflip \-r90 | cjpeg)
 .IP
 to do a perfect rotation, if available, or an approximated one if not.
+.PP
+This version of \fBjpegtran\fR also offers a lossless crop option, which
+discards data outside of a given image region but losslessly preserves what is
+inside.  Like the rotate and flip transforms, lossless crop is restricted by the
+current JPEG format; the upper left corner of the selected region must fall on
+an iMCU boundary.  If it doesn't, then it is silently moved up and/or left to
+the nearest iMCU boundary (the lower right corner is unchanged.)  Thus, the
+output image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching
+an 'f' character ("force") to the width or height number.
+.PP
+The image can be losslessly cropped by giving the switch:
 .TP
 .B \-crop WxH+X+Y
 Crop the image to a rectangular region of width W and height H, starting at
@@ -180,16 +200,17 @@
 .TP
 .B \-copy none
 Copy no extra markers from source file.  This setting suppresses all
-comments and other excess baggage present in the source file.
+comments and other metadata in the source file.
 .TP
 .B \-copy comments
 Copy only comment markers.  This setting copies comments from the source file
-but discards any other data that is inessential for image display.
+but discards any other metadata.
 .TP
 .B \-copy all
 Copy all extra markers.  This setting preserves miscellaneous markers
 found in the source file, such as JFIF thumbnails, Exif data, and Photoshop
-settings.  In some files, these extra markers can be sizable.
+settings.  In some files, these extra markers can be sizable.  Note that this
+option will copy thumbnails as-is; they will not be transformed.
 .PP
 The default behavior is \fB-copy comments\fR.  (Note: in IJG releases v6 and
 v6a, \fBjpegtran\fR always did the equivalent of \fB-copy none\fR.)
diff --git a/jpegtran.c b/jpegtran.c
index f978cef..c44f21e 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for JPEG transcoding.
  * It is very similar to cjpeg.c, and partly to djpeg.c, but provides
@@ -38,8 +39,8 @@
  */
 
 
-static const char * progname;   /* program name for error messages */
-static char * outfilename;      /* for -outfile switch */
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
@@ -131,9 +132,9 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
   boolean simple_progressive;
-  char * scansarg = NULL;       /* saves -scans parm if any */
+  char *scansarg = NULL;        /* saves -scans parm if any */
 
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
@@ -377,13 +378,13 @@
 #ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
 #endif
-  jvirt_barray_ptr * src_coef_arrays;
-  jvirt_barray_ptr * dst_coef_arrays;
+  jvirt_barray_ptr *src_coef_arrays;
+  jvirt_barray_ptr *dst_coef_arrays;
   int file_index;
   /* We assume all-in-memory processing and can therefore use only a
    * single file pointer for sequential input and output operation.
    */
-  FILE * fp;
+  FILE *fp;
 
   /* On Mac, fetch a command line. */
 #ifdef USE_CCOMMAND
diff --git a/jquant1.c b/jquant1.c
index 0e25354..e781481 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains 1-pass color quantization (color mapping) routines.
  * These routines provide mapping to a fixed color map using equally spaced
@@ -127,8 +128,8 @@
 typedef INT16 FSERROR;          /* 16 bits should be enough */
 typedef int LOCFSERROR;         /* use 'int' for calculation temps */
 #else
-typedef INT32 FSERROR;          /* may need more than 16 bits */
-typedef INT32 LOCFSERROR;       /* be sure calculation temps are big enough */
+typedef JLONG FSERROR;          /* may need more than 16 bits */
+typedef JLONG LOCFSERROR;       /* be sure calculation temps are big enough */
 #endif
 
 typedef FSERROR *FSERRPTR;  /* pointer to error array */
@@ -163,7 +164,7 @@
   boolean on_odd_row;           /* flag to remember which row we are on */
 } my_cquantizer;
 
-typedef my_cquantizer * my_cquantize_ptr;
+typedef my_cquantizer *my_cquantize_ptr;
 
 
 /*
@@ -253,7 +254,7 @@
    * (Forcing the upper and lower values to the limits ensures that
    * dithering can't produce a color outside the selected gamut.)
    */
-  return (int) (((INT32) j * MAXJSAMPLE + maxj/2) / maxj);
+  return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
 }
 
 
@@ -263,7 +264,7 @@
 /* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
 {
   /* Breakpoints are halfway between values returned by output_value */
-  return (int) (((INT32) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+  return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
 }
 
 
@@ -399,7 +400,7 @@
 {
   ODITHER_MATRIX_PTR odither;
   int j,k;
-  INT32 num,den;
+  JLONG num,den;
 
   odither = (ODITHER_MATRIX_PTR)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -409,10 +410,10 @@
    * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
    * On 16-bit-int machine, be careful to avoid overflow.
    */
-  den = 2 * ODITHER_CELLS * ((INT32) (ncolors - 1));
+  den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
   for (j = 0; j < ODITHER_SIZE; j++) {
     for (k = 0; k < ODITHER_SIZE; k++) {
-      num = ((INT32) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
+      num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
             * MAXJSAMPLE;
       /* Ensure round towards zero despite C's lack of consistency
        * about rounding negative values in integer division...
@@ -522,7 +523,7 @@
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
   JSAMPROW colorindex_ci;
-  int * dither;                 /* points to active row of dither matrix */
+  int *dither;                  /* points to active row of dither matrix */
   int row_index, col_index;     /* current indexes into dither matrix */
   int nc = cinfo->out_color_components;
   int ci;
@@ -574,9 +575,9 @@
   JSAMPROW colorindex0 = cquantize->colorindex[0];
   JSAMPROW colorindex1 = cquantize->colorindex[1];
   JSAMPROW colorindex2 = cquantize->colorindex[2];
-  int * dither0;                /* points to active row of dither matrix */
-  int * dither1;
-  int * dither2;
+  int *dither0;                 /* points to active row of dither matrix */
+  int *dither1;
+  int *dither2;
   int row_index, col_index;     /* current indexes into dither matrix */
   int row;
   JDIMENSION col;
diff --git a/jquant2.c b/jquant2.c
index 291b4f1..cfbd0f1 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains 2-pass color quantization (color mapping) routines.
  * These routines provide selection of a custom color map for an image,
@@ -127,11 +128,11 @@
 
 typedef UINT16 histcell;        /* histogram cell; prefer an unsigned type */
 
-typedef histcell * histptr; /* for pointers to histogram cells */
+typedef histcell *histptr; /* for pointers to histogram cells */
 
 typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
-typedef hist1d * hist2d;    /* type for the 2nd-level pointers */
-typedef hist2d * hist3d;        /* type for top-level pointer */
+typedef hist1d *hist2d;         /* type for the 2nd-level pointers */
+typedef hist2d *hist3d;         /* type for top-level pointer */
 
 
 /* Declarations for Floyd-Steinberg dithering.
@@ -159,8 +160,8 @@
 typedef INT16 FSERROR;          /* 16 bits should be enough */
 typedef int LOCFSERROR;         /* use 'int' for calculation temps */
 #else
-typedef INT32 FSERROR;          /* may need more than 16 bits */
-typedef INT32 LOCFSERROR;       /* be sure calculation temps are big enough */
+typedef JLONG FSERROR;          /* may need more than 16 bits */
+typedef JLONG LOCFSERROR;       /* be sure calculation temps are big enough */
 #endif
 
 typedef FSERROR *FSERRPTR;      /* pointer to error array */
@@ -183,10 +184,10 @@
   /* Variables for Floyd-Steinberg dithering */
   FSERRPTR fserrors;            /* accumulated errors */
   boolean on_odd_row;           /* flag to remember which row we are on */
-  int * error_limiter;          /* table for clamping the applied error */
+  int *error_limiter;           /* table for clamping the applied error */
 } my_cquantizer;
 
-typedef my_cquantizer * my_cquantize_ptr;
+typedef my_cquantizer *my_cquantize_ptr;
 
 
 /*
@@ -239,12 +240,12 @@
   int c1min, c1max;
   int c2min, c2max;
   /* The volume (actually 2-norm) of the box */
-  INT32 volume;
+  JLONG volume;
   /* The number of nonzero histogram cells within this box */
   long colorcount;
 } box;
 
-typedef box * boxptr;
+typedef box *boxptr;
 
 
 LOCAL(boxptr)
@@ -274,7 +275,7 @@
 {
   register boxptr boxp;
   register int i;
-  register INT32 maxv = 0;
+  register JLONG maxv = 0;
   boxptr which = NULL;
 
   for (i = 0, boxp = boxlist; i < numboxes; i++, boxp++) {
@@ -297,7 +298,7 @@
   histptr histp;
   int c0,c1,c2;
   int c0min,c0max,c1min,c1max,c2min,c2max;
-  INT32 dist0,dist1,dist2;
+  JLONG dist0,dist1,dist2;
   long ccount;
 
   c0min = boxp->c0min;  c0max = boxp->c0max;
@@ -571,7 +572,7 @@
  * distance from every colormap entry to every histogram cell.  Unfortunately,
  * it needs a work array to hold the best-distance-so-far for each histogram
  * cell (because the inner loop has to be over cells, not colormap entries).
- * The work array elements have to be INT32s, so the work array would need
+ * The work array elements have to be JLONGs, so the work array would need
  * 256Kb at our recommended precision.  This is not feasible in DOS machines.
  *
  * To get around these problems, we apply Thomas' method to compute the
@@ -637,8 +638,8 @@
   int maxc0, maxc1, maxc2;
   int centerc0, centerc1, centerc2;
   int i, x, ncolors;
-  INT32 minmaxdist, min_dist, max_dist, tdist;
-  INT32 mindist[MAXNUMCOLORS];  /* min distance to colormap entry i */
+  JLONG minmaxdist, min_dist, max_dist, tdist;
+  JLONG mindist[MAXNUMCOLORS];  /* min distance to colormap entry i */
 
   /* Compute true coordinates of update box's upper corner and center.
    * Actually we compute the coordinates of the center of the upper-corner
@@ -762,15 +763,15 @@
 {
   int ic0, ic1, ic2;
   int i, icolor;
-  register INT32 * bptr;        /* pointer into bestdist[] array */
-  JSAMPLE * cptr;               /* pointer into bestcolor[] array */
-  INT32 dist0, dist1;           /* initial distance values */
-  register INT32 dist2;         /* current distance in inner loop */
-  INT32 xx0, xx1;               /* distance increments */
-  register INT32 xx2;
-  INT32 inc0, inc1, inc2;       /* initial values for increments */
+  register JLONG *bptr;         /* pointer into bestdist[] array */
+  JSAMPLE *cptr;                /* pointer into bestcolor[] array */
+  JLONG dist0, dist1;           /* initial distance values */
+  register JLONG dist2;         /* current distance in inner loop */
+  JLONG xx0, xx1;               /* distance increments */
+  register JLONG xx2;
+  JLONG inc0, inc1, inc2;       /* initial values for increments */
   /* This array holds the distance to the nearest-so-far color for each cell */
-  INT32 bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
+  JLONG bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
 
   /* Initialize best-distance for each cell of the update box */
   bptr = bestdist;
@@ -840,7 +841,7 @@
   hist3d histogram = cquantize->histogram;
   int minc0, minc1, minc2;      /* lower left corner of update box */
   int ic0, ic1, ic2;
-  register JSAMPLE * cptr;      /* pointer into bestcolor[] array */
+  register JSAMPLE *cptr;       /* pointer into bestcolor[] array */
   register histptr cachep;      /* pointer into main cache array */
   /* This array lists the candidate colormap indexes. */
   JSAMPLE colorlist[MAXNUMCOLORS];
@@ -1079,7 +1080,7 @@
 /* Allocate and fill in the error_limiter table */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
-  int * table;
+  int *table;
   int in, out;
 
   table = (int *) (*cinfo->mem->alloc_small)
diff --git a/jsimd.h b/jsimd.h
index f1f584b..f2e2484 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2011, 2014 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -10,6 +11,8 @@
  *
  */
 
+#include "jchuff.h"             /* Declarations shared with jcphuff.c */
+
 EXTERN(int) jsimd_can_rgb_ycc (void);
 EXTERN(int) jsimd_can_rgb_gray (void);
 EXTERN(int) jsimd_can_ycc_rgb (void);
@@ -36,17 +39,17 @@
 EXTERN(int) jsimd_can_h2v1_downsample (void);
 
 EXTERN(void) jsimd_h2v2_downsample
-        (j_compress_ptr cinfo, jpeg_component_info * compptr,
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
 
 EXTERN(void) jsimd_h2v2_smooth_downsample
-        (j_compress_ptr cinfo, jpeg_component_info * compptr,
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample
-        (j_compress_ptr cinfo, jpeg_component_info * compptr,
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
         JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(int) jsimd_can_h2v2_upsample (void);
@@ -54,24 +57,24 @@
 EXTERN(int) jsimd_can_int_upsample (void);
 
 EXTERN(void) jsimd_h2v2_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v1_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_int_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
 EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v1_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
 EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
@@ -82,3 +85,9 @@
 EXTERN(void) jsimd_h2v1_merged_upsample
         (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+EXTERN(int) jsimd_can_huff_encode_one_block (void);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/jsimd_none.c b/jsimd_none.c
index 34aefc9..90dc965 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2014 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -102,19 +103,20 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
@@ -138,24 +140,24 @@
 }
 
 GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                      JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -173,17 +175,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -229,13 +231,13 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -258,17 +260,17 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -285,14 +287,14 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -321,28 +323,28 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -367,23 +369,36 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
 
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;                     /* unconditional; presumably "unsupported" */
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;                  /* stub: no argument is read or written */
+}
diff --git a/jsimddct.h b/jsimddct.h
index aa421fb..b19ab48 100644
--- a/jsimddct.h
+++ b/jsimddct.h
@@ -13,26 +13,26 @@
 EXTERN(int) jsimd_can_convsamp_float (void);
 
 EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                             DCTELEM * workspace);
+                             DCTELEM *workspace);
 EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
                                    JDIMENSION start_col,
-                                   FAST_FLOAT * workspace);
+                                   FAST_FLOAT *workspace);
 
 EXTERN(int) jsimd_can_fdct_islow (void);
 EXTERN(int) jsimd_can_fdct_ifast (void);
 EXTERN(int) jsimd_can_fdct_float (void);
 
-EXTERN(void) jsimd_fdct_islow (DCTELEM * data);
-EXTERN(void) jsimd_fdct_ifast (DCTELEM * data);
-EXTERN(void) jsimd_fdct_float (FAST_FLOAT * data);
+EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
+EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
 
 EXTERN(int) jsimd_can_quantize (void);
 EXTERN(int) jsimd_can_quantize_float (void);
 
-EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                             DCTELEM * workspace);
-EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                                   FAST_FLOAT * workspace);
+EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                             DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                                   FAST_FLOAT *workspace);
 
 EXTERN(int) jsimd_can_idct_2x2 (void);
 EXTERN(int) jsimd_can_idct_4x4 (void);
@@ -40,19 +40,19 @@
 EXTERN(int) jsimd_can_idct_12x12 (void);
 
 EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
-                             jpeg_component_info * compptr,
+                             jpeg_component_info *compptr,
                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
                              JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
-                             jpeg_component_info * compptr,
+                             jpeg_component_info *compptr,
                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
                              JDIMENSION output_col);
 EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
-                             jpeg_component_info * compptr,
+                             jpeg_component_info *compptr,
                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
                              JDIMENSION output_col);
 EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                                JDIMENSION output_col);
 
@@ -61,14 +61,14 @@
 EXTERN(int) jsimd_can_idct_float (void);
 
 EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                                JDIMENSION output_col);
 EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                                JDIMENSION output_col);
 EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                                JDIMENSION output_col);
diff --git a/jstdhuff.c b/jstdhuff.c
index 717c134..e202e8e 100644
--- a/jstdhuff.c
+++ b/jstdhuff.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to set the default Huffman tables, if they are
  * not already set.
diff --git a/jutils.c b/jutils.c
index 0e2611c..f9d3502 100644
--- a/jutils.c
+++ b/jutils.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code
  * relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains tables and miscellaneous utility routines needed
  * for both compression and decompression.
@@ -124,7 +125,7 @@
 
 
 GLOBAL(void)
-jzero_far (void * target, size_t bytestozero)
+jzero_far (void *target, size_t bytestozero)
 /* Zero out a chunk of memory. */
 /* This might be sample-array data, block-array data, or alloc_large data. */
 {
diff --git a/jversion.h b/jversion.h
index 92a1b67..6ce663d 100644
--- a/jversion.h
+++ b/jversion.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2012-2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains software version identification.
  */
@@ -25,12 +26,24 @@
 
 #endif
 
-#define JCOPYRIGHT      "Copyright (C) 1991-2012 Thomas G. Lane, Guido Vollbeding\n" \
-                        "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-                        "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
-                        "Copyright (C) 2009-2016 D. R. Commander\n" \
-                        "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+/*
+ * NOTE: It is our convention to place the authors in the following order:
+ * - libjpeg-turbo authors (2009-) in descending order of the date of their
+ *   most recent contribution to the project, then in ascending order of the
+ *   date of their first contribution to the project
+ * - Upstream authors in descending order of the date of the first inclusion of
+ *   their code
+ */
+
+#define JCOPYRIGHT      "Copyright (C) 2009-2016 D. R. Commander\n" \
+                        "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+                        "Copyright (C) 2015-2016 Matthieu Darbois\n" \
+                        "Copyright (C) 2015 Google, Inc.\n" \
                         "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
-                        "Copyright (C) 2013 Linaro Limited"
+                        "Copyright (C) 2013 Linaro Limited\n" \
+                        "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+                        "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+                        "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+                        "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \
 
 #define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others"
diff --git a/libjpeg.txt b/libjpeg.txt
index 7dbb354..71d37c6 100644
--- a/libjpeg.txt
+++ b/libjpeg.txt
@@ -1,10 +1,11 @@
 USING THE IJG JPEG LIBRARY
 
 This file was part of the Independent JPEG Group's software:
-Copyright (C) 1994-2011, Thomas G. Lane, Guido Vollbeding.
+Copyright (C) 1994-2013, Thomas G. Lane, Guido Vollbeding.
 libjpeg-turbo Modifications:
-Copyright (C) 2010, 2014, D. R. Commander.
-For conditions of distribution and use, see the accompanying README file.
+Copyright (C) 2010, 2014-2016, D. R. Commander.
+Copyright (C) 2015, Google, Inc.
+For conditions of distribution and use, see the accompanying README.ijg file.
 
 
 This file describes how to use the IJG JPEG library within an application
@@ -200,7 +201,7 @@
 feed in a colormapped image by expanding it to full-color format.  However
 JPEG often doesn't work very well with source data that has been colormapped,
 because of dithering noise.  This is discussed in more detail in the JPEG FAQ
-and the other references mentioned in the README file.
+and the other references mentioned in the README.ijg file.
 
 Pixels are stored by scanlines, with each scanline running from left to
 right.  The component values for each pixel are adjacent in the row; for
@@ -292,7 +293,7 @@
 If you use the standard destination module, you must open the target stdio
 stream beforehand.  Typical code for this step looks like:
 
-        FILE * outfile;
+        FILE *outfile;
         ...
         if ((outfile = fopen(filename, "wb")) == NULL) {
             fprintf(stderr, "can't open %s\n", filename);
@@ -539,7 +540,7 @@
 If you use the standard source module, you must open the source stdio stream
 beforehand.  Typical code for this step looks like:
 
-        FILE * infile;
+        FILE *infile;
         ...
         if ((infile = fopen(filename, "rb")) == NULL) {
             fprintf(stderr, "can't open %s\n", filename);
@@ -729,6 +730,91 @@
 The previous discussion of aborting compression cycles applies here too.
 
 
+Partial image decompression
+---------------------------
+
+Partial image decompression is convenient for performance-critical applications
+that wish to view only a portion of a large JPEG image without decompressing
+the whole thing.  It is also useful in memory-constrained environments (such as
+on mobile devices.)  This library provides the following functions to support
+partial image decompression:
+
+1. Skipping rows when decompressing
+
+        jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+This function provides application programmers with the ability to skip over
+multiple rows in the JPEG image.
+
+Suspending data sources are not supported by this function.  Calling
+jpeg_skip_scanlines() with a suspending data source will result in undefined
+behavior.
+
+jpeg_skip_scanlines() will not allow skipping past the bottom of the image.  If
+the value of num_lines is large enough to skip past the bottom of the image,
+then the function will skip to the end of the image instead.
+
+If the value of num_lines is valid, then jpeg_skip_scanlines() will always
+skip all of the input rows requested.  There is no need to inspect the return
+value of the function in that case.
+
+Best results will be achieved by calling jpeg_skip_scanlines() for large chunks
+of rows.  The function should be viewed as a way to quickly jump to a
+particular vertical offset in the JPEG image in order to decode a subset of the
+image.  Used in this manner, it will provide significant performance
+improvements.
+
+Calling jpeg_skip_scanlines() for small values of num_lines has several
+potential drawbacks:
+    1) JPEG decompression occurs in blocks, so if jpeg_skip_scanlines() is
+       called from the middle of a decompression block, then it is likely that
+       much of the decompression work has already been done for the first
+       couple of rows that need to be skipped.
+    2) When this function returns, it must leave the decompressor in a state
+       such that it is ready to read the next line.  This may involve
+       decompressing a block that must be partially skipped.
+These issues are especially tricky for cases in which upsampling requires
+context rows.  In the worst case, jpeg_skip_scanlines() will perform similarly
+to jpeg_read_scanlines() (since it will actually call jpeg_read_scanlines().)
+
+2. Decompressing partial scanlines
+
+        jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                            JDIMENSION *width);
+
+This function provides application programmers with the ability to decompress
+only a portion of each row in the JPEG image.  It must be called after
+jpeg_start_decompress() and before any calls to jpeg_read_scanlines() or
+jpeg_skip_scanlines().
+
+If xoffset and width do not form a valid subset of the image row, then this
+function will generate an error.  Note that if the output image is scaled, then
+xoffset and width are relative to the scaled image dimensions.
+
+xoffset and width are passed by reference because xoffset must fall on an iMCU
+boundary.  If it doesn't, then it will be moved left to the nearest iMCU
+boundary, and width will be increased accordingly.  If the calling program does
+not like the adjusted values of xoffset and width, then it can call
+jpeg_crop_scanline() again with new values (for instance, if it wants to move
+xoffset to the nearest iMCU boundary to the right instead of to the left.)
+
+After calling this function, cinfo->output_width will be set to the adjusted
+width.  This value should be used when allocating an output buffer to pass to
+jpeg_read_scanlines().
+
+The output image from a partial-width decompression will be identical to the
+corresponding image region from a full decode, with one exception:  The "fancy"
+(smooth) h2v2 (4:2:0) and h2v1 (4:2:2) upsampling algorithms fill in the
+missing chroma components by averaging the chroma components from neighboring
+pixels, except on the right and left edges of the image (where there are no
+neighboring pixels.)  When performing a partial-width decompression, these
+"fancy" upsampling algorithms may treat the left and right edges of the partial
+image region as if they are the left and right edges of the image, meaning that
+the upsampling algorithm may be simplified.  The result is that the pixels on
+the left or right edge of the partial image may not be exactly identical to the
+corresponding pixels in the original image.
+
+
 Mechanics of usage: include files, linking, etc
 -----------------------------------------------
 
@@ -781,7 +867,7 @@
 compression, as well as the "helper" routines provided to assist in this
 task.  Proper setting of some parameters requires detailed understanding
 of the JPEG standard; if you don't know what a parameter is for, it's best
-not to mess with it!  See REFERENCES in the README file for pointers to
+not to mess with it!  See REFERENCES in the README.ijg file for pointers to
 more info about JPEG.
 
 It's a good idea to call jpeg_set_defaults() first, even if you plan to set
@@ -872,6 +958,10 @@
 
 Compression parameters (cinfo fields) include:
 
+boolean arith_code
+	If TRUE, use arithmetic coding.
+	If FALSE, use Huffman coding.
+
 J_DCT_METHOD dct_method
         Selects the algorithm used for the DCT step.  Choices are:
                 JDCT_ISLOW: slow but accurate integer algorithm
@@ -928,7 +1018,7 @@
         If you use restarts, you may want to use larger intervals in those
         cases.
 
-const jpeg_scan_info * scan_info
+const jpeg_scan_info *scan_info
 int num_scans
         By default, scan_info is NULL; this causes the compressor to write a
         single-scan sequential JPEG file.  If not NULL, scan_info points to
@@ -974,7 +1064,7 @@
         default behavior ensures that the JPEG file's color space can be
         recognized by the decoder.
 
-JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]
+JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]
         Pointers to coefficient quantization tables, one per table slot,
         or NULL if no table is defined for a slot.  Usually these should
         be set via one of the above helper routines; jpeg_add_quant_table()
@@ -1009,8 +1099,8 @@
                 cinfo->comp_info[0].v_samp_factor = 1;
                 cinfo->comp_info[0].h_samp_factor = 1;
 
-JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
-JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
         Pointers to Huffman coding tables, one per table slot, or NULL if
         no table is defined for a slot.  Slots 0 and 1 are filled with the
         JPEG sample tables by jpeg_set_defaults().  If you need to allocate
@@ -1394,7 +1484,7 @@
         somewhere other than stderr.  Note that this method does not know
         how to generate a message, only where to send it.
 
-format_message (j_common_ptr cinfo, char * buffer)
+format_message (j_common_ptr cinfo, char *buffer)
         Constructs a readable error message string based on the error info
         stored in cinfo->err.  This method is called by output_message.  Few
         applications should need to override this method.  One possible
@@ -1471,7 +1561,7 @@
 A data destination manager struct contains a pointer and count defining the
 next byte to write in the work buffer and the remaining free space:
 
-        JOCTET * next_output_byte;  /* => next byte to write in buffer */
+        JOCTET *next_output_byte;   /* => next byte to write in buffer */
         size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
 
 The library increments the pointer and decrements the count until the buffer
@@ -1520,7 +1610,7 @@
 defining the next byte to read from the work buffer and the number of bytes
 remaining:
 
-        const JOCTET * next_input_byte; /* => next byte to read from buffer */
+        const JOCTET *next_input_byte;  /* => next byte to read from buffer */
         size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
 
 The library increments the pointer and decrements the count until the buffer
diff --git a/md5/CMakeLists.txt b/md5/CMakeLists.txt
new file mode 100644
index 0000000..526ef08
--- /dev/null
+++ b/md5/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(md5cmp md5cmp.c md5.c md5hl.c)
diff --git a/md5/md5.c b/md5/md5.c
index b30df97..087f4b0 100644
--- a/md5/md5.c
+++ b/md5/md5.c
@@ -36,7 +36,7 @@
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 #define Encode memcpy
 #define Decode memcpy
-#else 
+#else
 
 /*
  * OS X doesn't have le32toh() or htole32()
@@ -134,7 +134,7 @@
 	context->state[3] = 0x10325476;
 }
 
-/* 
+/*
  * MD5 block update operation. Continues an MD5 message-digest
  * operation, processing another message block, and updating the
  * context.
diff --git a/md5/md5cmp.c b/md5/md5cmp.c
index 07acda4..dfd60bd 100644
--- a/md5/md5cmp.c
+++ b/md5/md5cmp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2013, 2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include "./md5.h"
+#include "../tjutil.h"
 
 int main(int argc, char *argv[])
 {
diff --git a/md5/md5hl.c b/md5/md5hl.c
index eaa41e2..d2b7ca4 100644
--- a/md5/md5hl.c
+++ b/md5/md5hl.c
@@ -4,12 +4,25 @@
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, D. R. Commander
+ * Modifications are under the same license as the original code (see above)
+ * ----------------------------------------------------------------------------
  */
 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#ifdef _WIN32
+#include <io.h>
+#define close _close
+#define fstat _fstat
+#define lseek _lseek
+#define read _read
+#define stat _stat
+#else
 #include <unistd.h>
+#endif
 
 #include <errno.h>
 #include <stdio.h>
@@ -55,7 +68,11 @@
 	off_t n;
 
 	MD5Init(&ctx);
+#if _WIN32
+	f = _open(filename, O_RDONLY|O_BINARY);
+#else
 	f = open(filename, O_RDONLY);
+#endif
 	if (f < 0)
 		return 0;
 	if (fstat(f, &stbuf) < 0)
@@ -73,11 +90,11 @@
 			i = read(f, buffer, sizeof(buffer));
 		else
 			i = read(f, buffer, n);
-		if (i < 0) 
+		if (i < 0)
 			break;
 		MD5Update(&ctx, buffer, i);
 		n -= i;
-	} 
+	}
 	e = errno;
 	close(f);
 	errno = e;
diff --git a/rdbmp.c b/rdbmp.c
index df5dbcc..eaa7086 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -7,7 +7,8 @@
  * libjpeg-turbo Modifications:
  * Modified 2011 by Siarhei Siamashka.
  * Copyright (C) 2015, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Microsoft "BMP"
  * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
@@ -51,7 +52,7 @@
 
 /* Private version of data source object */
 
-typedef struct _bmp_source_struct * bmp_source_ptr;
+typedef struct _bmp_source_struct *bmp_source_ptr;
 
 typedef struct _bmp_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
diff --git a/rdcolmap.c b/rdcolmap.c
index ac6f50e..ed8ca3b 100644
--- a/rdcolmap.c
+++ b/rdcolmap.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file implements djpeg's "-map file" switch.  It reads a source image
  * and constructs a colormap to be supplied to the JPEG decompressor.
@@ -76,7 +77,7 @@
  */
 
 LOCAL(void)
-read_gif_map (j_decompress_ptr cinfo, FILE * infile)
+read_gif_map (j_decompress_ptr cinfo, FILE *infile)
 {
   int header[13];
   int i, colormaplen;
@@ -118,7 +119,7 @@
 
 
 LOCAL(int)
-pbm_getc (FILE * infile)
+pbm_getc (FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -135,7 +136,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_decompress_ptr cinfo, FILE * infile)
+read_pbm_integer (j_decompress_ptr cinfo, FILE *infile)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -168,7 +169,7 @@
  */
 
 LOCAL(void)
-read_ppm_map (j_decompress_ptr cinfo, FILE * infile)
+read_ppm_map (j_decompress_ptr cinfo, FILE *infile)
 {
   int c;
   unsigned int w, h, maxval, row, col;
@@ -228,7 +229,7 @@
  */
 
 GLOBAL(void)
-read_color_map (j_decompress_ptr cinfo, FILE * infile)
+read_color_map (j_decompress_ptr cinfo, FILE *infile)
 {
   /* Allocate space for a color map of maximum supported size. */
   cinfo->colormap = (*cinfo->mem->alloc_sarray)
diff --git a/rdgif.c b/rdgif.c
index 5caad8a..ce689f7 100644
--- a/rdgif.c
+++ b/rdgif.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in GIF format.
  *
diff --git a/rdjpgcom.c b/rdjpgcom.c
index 2f0115d..b3076dd 100644
--- a/rdjpgcom.c
+++ b/rdjpgcom.c
@@ -6,7 +6,8 @@
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a very simple stand-alone application that displays
  * the text in COM (comment) markers in a JFIF file.
@@ -56,7 +57,7 @@
  * To reuse this code in another application, you might need to change these.
  */
 
-static FILE * infile;           /* input JPEG file */
+static FILE *infile;            /* input JPEG file */
 
 /* Return next input byte, or EOF if no more */
 #define NEXTBYTE()  getc(infile)
@@ -278,7 +279,7 @@
   unsigned int length;
   unsigned int image_height, image_width;
   int data_precision, num_components;
-  const char * process;
+  const char *process;
   int ci;
 
   length = read_2_bytes();      /* usual parameter length count */
@@ -396,7 +397,7 @@
 
 /* Command line parsing code */
 
-static const char * progname;   /* program name for error messages */
+static const char *progname;    /* program name for error messages */
 
 
 static void
@@ -416,7 +417,7 @@
 
 
 static int
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -448,7 +449,7 @@
 main (int argc, char **argv)
 {
   int argn;
-  char * arg;
+  char *arg;
   int verbose = 0, raw = 0;
 
   /* On Mac, fetch a command line. */
diff --git a/rdppm.c b/rdppm.c
index 5b52f0e..b71d337 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in PPM/PGM format.
  * The extended 2-byte-per-sample raw PPM/PGM formats are supported.
@@ -71,11 +72,11 @@
   int maxval;
 } ppm_source_struct;
 
-typedef ppm_source_struct * ppm_source_ptr;
+typedef ppm_source_struct *ppm_source_ptr;
 
 
 LOCAL(int)
-pbm_getc (FILE * infile)
+pbm_getc (FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -92,7 +93,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_compress_ptr cinfo, FILE * infile, unsigned int maxval)
+read_pbm_integer (j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -140,7 +141,7 @@
 /* This version is for reading text-format PGM files with any maxval */
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
-  FILE * infile = source->pub.input_file;
+  FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
@@ -159,7 +160,7 @@
 /* This version is for reading text-format PPM files with any maxval */
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
-  FILE * infile = source->pub.input_file;
+  FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
@@ -181,7 +182,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -202,7 +203,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -240,7 +241,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
   unsigned int maxval = source->maxval;
@@ -267,7 +268,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
   unsigned int maxval = source->maxval;
@@ -419,7 +420,7 @@
 
   /* Compute the rescaling array if required. */
   if (need_rescale) {
-    INT32 val, half_maxval;
+    long val, half_maxval;
 
     /* On 16-bit-int machines we have to be careful of maxval = 65535 */
     source->rescale = (JSAMPLE *)
@@ -427,7 +428,7 @@
                                   (size_t) (((long) maxval + 1L) *
                                             sizeof(JSAMPLE)));
     half_maxval = maxval / 2;
-    for (val = 0; val <= (INT32) maxval; val++) {
+    for (val = 0; val <= (long) maxval; val++) {
       /* The multiplication here must be done in 32 bits to avoid overflow */
       source->rescale[val] = (JSAMPLE) ((val * MAXJSAMPLE + half_maxval) /
                                         maxval);
diff --git a/rdrle.c b/rdrle.c
index 8df3ddb..226c528 100644
--- a/rdrle.c
+++ b/rdrle.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Utah RLE format.
  * The Utah Raster Toolkit library is required (version 3.1 or later).
@@ -61,7 +62,7 @@
  * then fetch the required row from the virtual array on subsequent calls.
  */
 
-typedef struct _rle_source_struct * rle_source_ptr;
+typedef struct _rle_source_struct *rle_source_ptr;
 
 typedef struct _rle_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
@@ -70,7 +71,7 @@
   jvirt_sarray_ptr image;       /* virtual array to hold the image */
   JDIMENSION row;               /* current row # in the virtual array */
   rle_hdr header;               /* Input file information */
-  rle_pixel** rle_row;          /* holds a row returned by rle_getrow() */
+  rle_pixel **rle_row;          /* holds a row returned by rle_getrow() */
 
 } rle_source_struct;
 
@@ -215,7 +216,7 @@
   colormap = source->header.cmap;
   dest_row = source->pub.buffer[0];
   source->row--;
-  src_row = * (*cinfo->mem->access_virt_sarray)
+  src_row = *(*cinfo->mem->access_virt_sarray)
     ((j_common_ptr) cinfo, source->image, source->row, (JDIMENSION) 1, FALSE);
 
   for (col = cinfo->image_width; col > 0; col--) {
@@ -288,7 +289,7 @@
   case MAPPEDGRAY:
   case TRUECOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
-      scanline = * (*cinfo->mem->access_virt_sarray)
+      scanline = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
       rle_row = source->rle_row;
       rle_getrow(&source->header, rle_row);
@@ -311,7 +312,7 @@
 
   case DIRECTCOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
-      scanline = * (*cinfo->mem->access_virt_sarray)
+      scanline = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
       rle_getrow(&source->header, rle_row);
 
diff --git a/rdswitch.c b/rdswitch.c
index a0aa37c..7d870c3 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to process some of cjpeg's more complicated
  * command-line switches.  Switches processed here are:
@@ -21,7 +22,7 @@
 
 
 LOCAL(int)
-text_getc (FILE * file)
+text_getc (FILE *file)
 /* Read next char, skipping over any comments (# to end of line) */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -38,7 +39,7 @@
 
 
 LOCAL(boolean)
-read_text_integer (FILE * file, long * result, int * termchar)
+read_text_integer (FILE *file, long *result, int *termchar)
 /* Read an unsigned decimal integer from a file, store it in result */
 /* Reads one trailing character after the integer; returns it in termchar */
 {
@@ -77,7 +78,8 @@
 #endif
 
 GLOBAL(boolean)
-read_quant_tables (j_compress_ptr cinfo, char * filename, boolean force_baseline)
+read_quant_tables (j_compress_ptr cinfo, char *filename,
+                   boolean force_baseline)
 /* Read a set of quantization tables from the specified file.
  * The file is plain ASCII text: decimal numbers with whitespace between.
  * Comments preceded by '#' may be included in the file.
@@ -88,7 +90,7 @@
  * You must use -qslots if you want a different component->table mapping.
  */
 {
-  FILE * fp;
+  FILE *fp;
   int tblno, i, termchar;
   long val;
   unsigned int table[DCTSIZE2];
@@ -138,7 +140,7 @@
 #ifdef C_MULTISCAN_FILES_SUPPORTED
 
 LOCAL(boolean)
-read_scan_integer (FILE * file, long * result, int * termchar)
+read_scan_integer (FILE *file, long *result, int *termchar)
 /* Variant of read_text_integer that always looks for a non-space termchar;
  * this simplifies parsing of punctuation in scan scripts.
  */
@@ -167,7 +169,7 @@
 
 
 GLOBAL(boolean)
-read_scan_script (j_compress_ptr cinfo, char * filename)
+read_scan_script (j_compress_ptr cinfo, char *filename)
 /* Read a scan script from the specified text file.
  * Each entry in the file defines one scan to be emitted.
  * Entries are separated by semicolons ';'.
@@ -184,10 +186,10 @@
  * jcmaster.c will validate the script parameters.
  */
 {
-  FILE * fp;
+  FILE *fp;
   int scanno, ncomps, termchar;
   long val;
-  jpeg_scan_info * scanptr;
+  jpeg_scan_info *scanptr;
 #define MAX_SCANS  100          /* quite arbitrary limit */
   jpeg_scan_info scans[MAX_SCANS];
 
diff --git a/rdtarga.c b/rdtarga.c
index b15ac64..b9bbd07 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Targa format.
  *
@@ -45,7 +46,7 @@
 
 /* Private version of data source object */
 
-typedef struct _tga_source_struct * tga_source_ptr;
+typedef struct _tga_source_struct *tga_source_ptr;
 
 typedef struct _tga_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index b9f6ca5..7fb8d0f 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/deb-control.tmpl b/release/deb-control.tmpl
index 510b1d6..681721d 100644
--- a/release/deb-control.tmpl
+++ b/release/deb-control.tmpl
@@ -9,11 +9,11 @@
 Installed-Size: {__SIZE}
 Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
  libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
- NEON) to accelerate baseline JPEG compression and decompression on x86,
- x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as
- fast as libjpeg, all else being equal.  On other types of systems,
- libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue
- of its highly-optimized Huffman coding routines.  In many cases, the
+ NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+ x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+ generally 2-6x as fast as libjpeg, all else being equal.  On other types of
+ systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
+ by virtue of its highly-optimized Huffman coding routines.  In many cases, the
  performance of libjpeg-turbo rivals that of proprietary high-speed JPEG
  codecs.
  .
diff --git a/release/libjpeg-turbo.nsi.in b/release/libjpeg-turbo.nsi.in
index c675eac..f458b81 100755
--- a/release/libjpeg-turbo.nsi.in
+++ b/release/libjpeg-turbo.nsi.in
@@ -79,8 +79,9 @@
 	File "@CMAKE_SOURCE_DIR@\jpeglib.h"
 	File "@CMAKE_SOURCE_DIR@\turbojpeg.h"
 	SetOutPath $INSTDIR\doc
-	File "@CMAKE_SOURCE_DIR@\README"
-	File "@CMAKE_SOURCE_DIR@\README-turbo.txt"
+	File "@CMAKE_SOURCE_DIR@\README.ijg"
+	File "@CMAKE_SOURCE_DIR@\README.md"
+	File "@CMAKE_SOURCE_DIR@\LICENSE.md"
 	File "@CMAKE_SOURCE_DIR@\example.c"
 	File "@CMAKE_SOURCE_DIR@\libjpeg.txt"
 	File "@CMAKE_SOURCE_DIR@\structure.txt"
@@ -140,8 +141,9 @@
 	Delete $INSTDIR\include\jpeglib.h"
 	Delete $INSTDIR\include\turbojpeg.h"
 	Delete $INSTDIR\uninstall_@VERSION@.exe
-	Delete $INSTDIR\doc\README
-	Delete $INSTDIR\doc\README-turbo.txt
+	Delete $INSTDIR\doc\README.ijg
+	Delete $INSTDIR\doc\README.md
+	Delete $INSTDIR\doc\LICENSE.md
 	Delete $INSTDIR\doc\example.c
 	Delete $INSTDIR\doc\libjpeg.txt
 	Delete $INSTDIR\doc\structure.txt
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 23793cf..4b792d7 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -44,12 +44,12 @@
 
 %description
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as
-libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines.  In many cases, the performance of
-libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+generally 2-6x as fast as libjpeg, all else being equal.  On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines.  In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less
 powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
@@ -124,10 +124,13 @@
 %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
 %{_libdir}/libjpeg.so
 %{_libdir}/libjpeg.a
+%dir %{_libdir}/pkgconfig
+%{_libdir}/pkgconfig/libjpeg.pc
 %{_libdir}/libturbojpeg.so.0.1.0
 %{_libdir}/libturbojpeg.so.0
 %{_libdir}/libturbojpeg.so
 %{_libdir}/libturbojpeg.a
+%{_libdir}/pkgconfig/libturbojpeg.pc
 %dir %{_includedir}
 %{_includedir}/jconfig.h
 %{_includedir}/jerror.h
diff --git a/release/libjpeg.pc.in b/release/libjpeg.pc.in
new file mode 100644
index 0000000..40795f7
--- /dev/null
+++ b/release/libjpeg.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libjpeg
+Description: A SIMD-accelerated JPEG codec that provides the libjpeg API
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir} -ljpeg
+Cflags: -I${includedir}
diff --git a/release/libturbojpeg.pc.in b/release/libturbojpeg.pc.in
new file mode 100644
index 0000000..7d4b656
--- /dev/null
+++ b/release/libturbojpeg.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libturbojpeg
+Description: A SIMD-accelerated JPEG codec that provides the TurboJPEG API
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir} -lturbojpeg
+Cflags: -I${includedir}
diff --git a/release/uninstall.in b/release/uninstall.in
index f167bbd..6cd1f86 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -1,4 +1,4 @@
-# Copyright (C)2009-2011, 2013 D. R. Commander.  All Rights Reserved.
+# Copyright (C)2009-2011, 2013, 2016 D. R. Commander.  All Rights Reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -66,6 +66,9 @@
 if [ -d $BINDIR ]; then
 	rmdir $BINDIR 2>&1 || EXITSTATUS=-1
 fi
+if [ -d $LIBDIR/pkgconfig ]; then
+	rmdir $LIBDIR/pkgconfig 2>&1 || EXITSTATUS=-1
+fi
 if [ -d $LIBDIR ]; then
 	rmdir $LIBDIR 2>&1 || EXITSTATUS=-1
 fi
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index b16c467..37938ec 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -22,17 +22,19 @@
 
 if(SIMD_X86_64)
   set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
-    jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64 jdsample-sse2-64
-    jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64 jidctfst-sse2-64
-    jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64 jquanti-sse2-64)
+    jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
+    jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
+    jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
+    jquanti-sse2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
     jcgray-mmx jcsample-mmx jdcolor-mmx jdmerge-mmx jdsample-mmx jfdctfst-mmx
     jfdctint-mmx jidctfst-mmx jidctint-mmx jidctred-mmx jquant-mmx jfdctflt-sse
-    jidctflt-sse jquant-sse jccolor-sse2 jcgray-sse2 jcsample-sse2 jdcolor-sse2
-    jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2 jidctflt-sse2
-    jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2 jquanti-sse2)
+    jidctflt-sse jquant-sse jccolor-sse2 jcgray-sse2 jchuff-sse2 jcsample-sse2
+    jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
+    jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
+    jquanti-sse2)
   message(STATUS "Building i386 SIMD extensions")
 endif()
 
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 3029f1c..fad6c8c 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -6,17 +6,18 @@
 	jccolext-mmx.asm   jcgryext-mmx.asm   jdcolext-mmx.asm   jdmrgext-mmx.asm \
 	jccolext-sse2.asm  jcgryext-sse2.asm  jdcolext-sse2.asm  jdmrgext-sse2.asm \
 	jccolext-sse2-64.asm  jcgryext-sse2-64.asm  jdcolext-sse2-64.asm \
-	jdmrgext-sse2-64.asm
+	jdmrgext-sse2-64.asm  jccolext-altivec.c    jcgryext-altivec.c \
+	jdcolext-altivec.c    jdmrgext-altivec.c
 
 if SIMD_X86_64
 
 libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
-	jcolsamp.inc jdct.inc jfdctflt-sse-64.asm \
-	jccolor-sse2-64.asm   jcgray-sse2-64.asm    jcsample-sse2-64.asm \
-	jdcolor-sse2-64.asm   jdmerge-sse2-64.asm   jdsample-sse2-64.asm \
-	jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm  jidctflt-sse2-64.asm \
-	jidctfst-sse2-64.asm  jidctint-sse2-64.asm  jidctred-sse2-64.asm  \
-	jquantf-sse2-64.asm   jquanti-sse2-64.asm
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \
+	jccolor-sse2-64.asm   jcgray-sse2-64.asm    jchuff-sse2-64.asm \
+	jcsample-sse2-64.asm  jdcolor-sse2-64.asm   jdmerge-sse2-64.asm \
+	jdsample-sse2-64.asm  jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm \
+	jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
+	jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm
 
 jccolor-sse2-64.lo:  jccolext-sse2-64.asm
 jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
@@ -28,18 +29,18 @@
 if SIMD_I386
 
 libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
-	jcolsamp.inc jdct.inc jsimdcpu.asm \
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu.asm \
 	jfdctflt-3dn.asm   jidctflt-3dn.asm   jquant-3dn.asm \
 	jccolor-mmx.asm    jcgray-mmx.asm     jcsample-mmx.asm \
 	jdcolor-mmx.asm    jdmerge-mmx.asm    jdsample-mmx.asm \
 	jfdctfst-mmx.asm   jfdctint-mmx.asm   jidctfst-mmx.asm \
 	jidctint-mmx.asm   jidctred-mmx.asm   jquant-mmx.asm \
 	jfdctflt-sse.asm   jidctflt-sse.asm   jquant-sse.asm \
-	jccolor-sse2.asm   jcgray-sse2.asm    jcsample-sse2.asm \
-	jdcolor-sse2.asm   jdmerge-sse2.asm   jdsample-sse2.asm \
-	jfdctfst-sse2.asm  jfdctint-sse2.asm  jidctflt-sse2.asm \
-	jidctfst-sse2.asm  jidctint-sse2.asm  jidctred-sse2.asm  \
-	jquantf-sse2.asm   jquanti-sse2.asm
+	jccolor-sse2.asm   jcgray-sse2.asm    jchuff-sse2.asm \
+	jcsample-sse2.asm  jdcolor-sse2.asm   jdmerge-sse2.asm \
+	jdsample-sse2.asm  jfdctfst-sse2.asm  jfdctint-sse2.asm \
+	jidctflt-sse2.asm  jidctfst-sse2.asm  jidctint-sse2.asm \
+	jidctred-sse2.asm  jquantf-sse2.asm   jquanti-sse2.asm
 
 jccolor-mmx.lo:   jccolext-mmx.asm
 jcgray.-mmx.lo:   jcgryext-mmx.asm
@@ -70,6 +71,23 @@
 
 endif
 
+if SIMD_POWERPC
+
+libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h \
+	jccolor-altivec.c     jcgray-altivec.c      jcsample-altivec.c \
+	jdcolor-altivec.c     jdmerge-altivec.c     jdsample-altivec.c \
+	jfdctfst-altivec.c    jfdctint-altivec.c \
+	jidctfst-altivec.c    jidctint-altivec.c \
+	jquanti-altivec.c
+libsimd_la_CFLAGS = -maltivec
+
+jccolor-altivec.lo:  jccolext-altivec.c
+jcgray-altivec.lo:   jcgryext-altivec.c
+jdcolor-altivec.lo:  jdcolext-altivec.c
+jdmerge-altivec.lo:  jdmrgext-altivec.c
+
+endif
+
 AM_CPPFLAGS = -I$(top_srcdir)
 
 .asm.lo:
diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
new file mode 100644
index 0000000..403aa96
--- /dev/null
+++ b/simd/jccolext-altivec.c
@@ -0,0 +1,267 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+/* Converts num_rows rows of packed RGB pixels into Y, Cb, and Cr planes. */
+void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = {0};
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants (weights are 16-bit fixed-point; see SCALEBITS) */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) },
+    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load whole vectors, then realign via vec_perm() */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: stage the partial chunk in tmpbuf to avoid overread */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = VEC_LD(0, tmpbuf);
+          rgb1 = VEC_LD(16, tmpbuf);
+          rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = VEC_LD(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load straight from the input row */
+          rgb0 = VEC_LD(0, inptr);
+          if (num_cols > 16)
+            rgb1 = VEC_LD(16, inptr);
+          if (num_cols > 32)
+            rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = VEC_LD(48, inptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr0);
+
+      /* Calculate Cb values */
+      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+                                   (__vector unsigned int)cb0);
+      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+                                   (__vector unsigned int)cb1);
+      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+                                   (__vector unsigned int)cb2);
+      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+                                   (__vector unsigned int)cb3);
+      cbl = vec_perm((__vector unsigned short)cb0,
+                     (__vector unsigned short)cb1, shift_pack_index);
+      cbh = vec_perm((__vector unsigned short)cb2,
+                     (__vector unsigned short)cb3, shift_pack_index);
+      cb = vec_pack(cbl, cbh);
+      vec_st(cb, 0, outptr1);
+
+      /* Calculate Cr values */
+      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+                                   (__vector unsigned int)cr0);
+      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+                                   (__vector unsigned int)cr1);
+      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+                                   (__vector unsigned int)cr2);
+      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+                                   (__vector unsigned int)cr3);
+      crl = vec_perm((__vector unsigned short)cr0,
+                     (__vector unsigned short)cr1, shift_pack_index);
+      crh = vec_perm((__vector unsigned short)cr2,
+                     (__vector unsigned short)cr3, shift_pack_index);
+      cr = vec_pack(crl, crh);
+      vec_st(cr, 0, outptr2);
+    }
+  }
+}
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
new file mode 100644
index 0000000..04b8708
--- /dev/null
+++ b/simd/jccolor-altivec.c
@@ -0,0 +1,104 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329                 /* FIX(0.08131) */
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_168 11059                /* FIX(0.16874) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_331 21709                /* FIX(0.33126) */
+#define F_0_418 27439                /* FIX(0.41869) */
+#define F_0_500 32768                /* FIX(0.50000) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jccolext-altivec.c"  /* default jsimd_rgb_ycc_convert_altivec() */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"  /* reuses RGBG_INDEX0-3 from above */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/simd/jcgray-altivec.c b/simd/jcgray-altivec.c
new file mode 100644
index 0000000..b52fade
--- /dev/null
+++ b/simd/jcgray-altivec.c
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jcgryext-altivec.c"  /* default jsimd_rgb_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"  /* reuses RGBG_INDEX0-3 from above */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
new file mode 100644
index 0000000..c171615
--- /dev/null
+++ b/simd/jcgryext-altivec.c
@@ -0,0 +1,227 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+/* Convert num_rows rows of packed RGB pixels into rows of one-byte luminance (Y) samples. */
+void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
+                                     JSAMPARRAY input_buf,
+                                     JSAMPIMAGE output_buf,
+                                     JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;  /* both count input bytes, not pixels */
+#if __BIG_ENDIAN__
+  int offset;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];  /* aligned bounce buffer for the slow paths */
+#endif
+  /* rgb1..rgb4 start zeroed because the loads that fill them are skipped on short rows */
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = {0};
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants: (R,G) and (B,G) multiplier pairs, the initial accumulator, and permute indices */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];  /* only component plane 0 is written */
+    output_row++;
+    /* Each pass consumes up to 16 pixels and always stores a full 16 output bytes */
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;  /* bytes remaining, measured from the 16-byte boundary at or below inptr */
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load one more vector than the aligned case needs, then realign below */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);  /* permute vector for realigning adjacent vector pairs */
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: partial final vector, so stage the bytes in tmpbuf instead of overreading */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: aligned input, load only the vectors that contain row data */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian: vec_vsx_ld() is used directly; there is no alignment or partial-vector handling here */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values: first the R,G terms on top of pd_onehalf, then the B,G terms on top of that sum */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);  /* NOTE(review): full-vector store; presumably outptr is 16-byte aligned and padded -- confirm */
+    }
+  }
+}
diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm
new file mode 100644
index 0000000..84eaeeb
--- /dev/null
+++ b/simd/jchuff-sse2-64.asm
@@ -0,0 +1,361 @@
+;
+; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright 2009-2011, 2014-2016 D. R. Commander.
+; Copyright 2015 Matthieu Darbois
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0  ; write the oldest 8 buffered bits to [buffer], stuffing 0x00 after 0xFF; clobbers rcx, rdx
+        sub put_bits, 8  ; put_bits -= 8;
+        mov rdx, put_buffer  ; work on a copy so put_buffer keeps its remaining bits
+        mov ecx, put_bits  ; variable shift count has to be in cl
+        shr rdx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+        mov byte [buffer], dl  ; *buffer++ = c;
+        add buffer, 1
+        cmp dl, 0xFF  ; need to stuff a zero byte?
+        jne %%.EMIT_BYTE_END
+        mov byte [buffer], 0  ; *buffer++ = 0;
+        add buffer, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1  ; (code): append code to the low end of put_buffer; the bit count must already be in ecx
+        add put_bits, ecx  ; put_bits += size;
+        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
+        or  put_buffer, %1  ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF31 0  ; flush four bytes when 32 or more bits are pending; clobbers rcx, rdx via EMIT_BYTE
+        cmp put_bits, 32  ; if (put_bits > 31) {
+        jl %%.CHECKBUF31_END
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+%%.CHECKBUF31_END:
+%endmacro
+
+%macro CHECKBUF47 0  ; flush six bytes when 48 or more bits are pending; clobbers rcx, rdx via EMIT_BYTE
+        cmp put_bits, 48  ; if (put_bits > 47) {
+        jl %%.CHECKBUF47_END
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+%%.CHECKBUF47_END:
+%endmacro
+
+%macro EMIT_BITS 2  ; (code, size): make room first, then append; clobbers rcx, rdx
+        CHECKBUF47  ; flush before loading ecx, because the flush itself overwrites rcx
+        mov ecx, %2  ; PUT_BITS takes the bit count in ecx
+        PUT_BITS %1
+%endmacro
+
+%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3): gather 32 words of block at indices jno*, store abs values at t1+ko and emit-ready values at t2+ko; clobbers xmm8-xmm11
+    pxor xmm8, xmm8  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm9, xmm9  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm10, xmm10  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm11, xmm11  ; __m128i neg = _mm_setzero_si128();
+    pinsrw %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw %37, ebx, 7  ; xmm_shadow[31] = low word of ebx (the caller zeroes ebx); jno31 is ignored when ko == 32
+%endif
+    pcmpgtw xmm8, %34  ; neg = _mm_cmpgt_epi16(neg, x1);  all-ones in lanes where x1 < 0
+    pcmpgtw xmm9, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm10, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm11, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw %34, xmm8   ; x1 = _mm_add_epi16(x1, neg);  subtracts 1 from negative lanes
+    paddw %35, xmm9   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %36, xmm10  ; x1 = _mm_add_epi16(x1, neg);
+    paddw %37, xmm11  ; x1 = _mm_add_epi16(x1, neg);
+    pxor %34, xmm8    ; x1 = _mm_xor_si128(x1, neg);  x1 is now abs(x1)
+    pxor %35, xmm9    ; x1 = _mm_xor_si128(x1, neg);
+    pxor %36, xmm10   ; x1 = _mm_xor_si128(x1, neg);
+    pxor %37, xmm11   ; x1 = _mm_xor_si128(x1, neg);
+    pxor xmm8, %34    ; neg = _mm_xor_si128(neg, x1);  value itself if non-negative, else complement of its magnitude
+    pxor xmm9, %35    ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm10, %36   ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm11, %37   ; neg = _mm_xor_si128(neg, x1);
+    movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET*)
+; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
+;                                   JCOEFPTR block, int last_dc_val,
+;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; r10 = working_state *state
+; r11 = JOCTET *buffer
+; r12 = JCOEFPTR block
+; r13 = int last_dc_val
+; r14 = c_derived_tbl *dctbl
+; r15 = c_derived_tbl *actbl
+
+%define t1              rbp-(DCTSIZE2*SIZEOF_WORD)
+%define t2              t1-(DCTSIZE2*SIZEOF_WORD)
+%define put_buffer      r8
+%define put_bits        r9d
+%define buffer          rax
+; t1 and t2 are two DCTSIZE2-word scratch arrays just below the aligned rbp; kloop_prepare fills both
+        align   16
+        global  EXTN(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = unaligned rsp (taken after push rbp)
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; keep the unaligned rsp at [aligned rbp] for the epilogue
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [t2]                       ; reserve t1 and t2 below rbp
+        collect_args                            ; presumably moves the six arguments into r10-r15 (macro is defined elsewhere)
+%ifdef WIN64
+        movaps  XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8   ; save xmm8-xmm11, which kloop_prepare overwrites
+        movaps  XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
+        movaps  XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
+        movaps  XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
+        sub     rsp, 4*SIZEOF_XMMWORD
+%endif
+        push rbx  ; rbx is used as a scratch register below
+
+        mov buffer, r11  ; r11 is now scratch
+
+        mov put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
+        mov put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+        push r10  ; r10 is now scratch
+
+        ; Encode the DC coefficient difference per section F.1.2.1
+        movsx edi, word [r12]  ; temp = temp2 = block[0] - last_dc_val;
+        sub   edi, r13d  ; r13 is not used anymore
+        mov   ebx, edi
+
+        ; This is a well-known technique for obtaining the absolute value
+        ; without a branch.  It is derived from an assembly language technique
+        ; presented in "How to Optimize for the Pentium Processors",
+        ; Copyright (c) 1996, 1997 by Agner Fog.
+        mov esi, edi
+        sar esi, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+        xor edi, esi  ; temp ^= temp3;
+        sub edi, esi  ; temp -= temp3;
+
+        ; For a negative input, want temp2 = bitwise complement of abs(input)
+        ; This code assumes we are on a two's complement machine
+        add ebx, esi  ; temp2 += temp3;
+
+        ; Find the number of bits needed for the magnitude of the coefficient
+        lea   r11, [rel jpeg_nbits_table]
+        movzx rdi, byte [r11 + rdi]  ; nbits = JPEG_NBITS(temp);
+        ; Emit the Huffman-coded symbol for the number of bits
+        mov   r11d,  INT [r14 + rdi * 4]  ; code = dctbl->ehufco[nbits];
+        movzx  esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
+        EMIT_BITS r11, esi  ; EMIT_BITS(code, size)
+
+        ; Mask off any extra bits in code
+        mov esi, 1
+        mov ecx, edi
+        shl esi, cl
+        dec esi
+        and ebx, esi  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+        ; Emit that number of bits of the value, if positive,
+        ; or the complement of its magnitude, if negative.
+        EMIT_BITS rbx, edi  ; EMIT_BITS(temp2, nbits)
+
+        ; Prepare data: t1[i] / t2[i] describe block[jno_i]; slot 63 is a zero filler taken from ebx
+        xor ebx, ebx
+        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                       xmm0, xmm1, xmm2, xmm3
+        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                       xmm4, xmm5, xmm6, xmm7
+
+        pxor xmm8, xmm8
+        pcmpeqw xmm0, xmm8  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm8  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm8  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm8  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        pcmpeqw xmm4, xmm8  ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
+        pcmpeqw xmm5, xmm8  ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
+        pcmpeqw xmm6, xmm8  ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
+        pcmpeqw xmm7, xmm8  ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        packsswb xmm4, xmm5  ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
+        packsswb xmm6, xmm7  ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
+        pmovmskb r11d, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb r12d, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        pmovmskb r13d, xmm4  ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
+        pmovmskb r14d, xmm6  ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
+        shl r12, 16
+        shl r14, 16
+        or  r11, r12  ; r11 = zero-flags for t1[0..31]
+        or  r13, r14  ; r13 = zero-flags for t1[32..63]
+        shl r13, 32
+        or  r11, r13  ; r11 = 64 flags, bit i set when t1[i] == 0
+        not r11  ; index = ~index;  bit i now set when t1[i] != 0
+
+        ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
+        ;jmp .EFN
+
+        mov   r13d,  INT [r15 + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        lea rsi, [t1]  ; rsi = &t1[k], starting at k = 0
+.BLOOP:
+        bsf r12, r11  ; r = __builtin_ctzl(index);
+        jz .ELOOP  ; index == 0: no nonzero values remain
+        mov rcx, r12  ; variable shift count has to be in cl
+        lea rsi, [rsi+r12*2]  ; k += r;
+        shr r11, cl  ; index >>= r;
+        movzx rdi, word [rsi]  ; temp = t1[k];
+        lea   rbx, [rel jpeg_nbits_table]
+        movzx rdi, byte [rbx + rdi]  ; nbits = JPEG_NBITS(temp);
+.BRLOOP:
+        cmp r12, 16  ; while (r > 15) {
+        jl .ERLOOP
+        EMIT_BITS r13, r14d  ; EMIT_BITS(code_0xf0, size_0xf0)
+        sub r12, 16  ; r -= 16;
+        jmp .BRLOOP
+.ERLOOP:
+        ; Emit Huffman symbol for run length / number of bits
+        CHECKBUF31  ; uses rcx, rdx
+
+        shl r12, 4  ; temp3 = (r << 4) + nbits;
+        add r12, rdi
+        mov   ebx,  INT [r15 + r12 * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
+        PUT_BITS rbx
+
+        ;EMIT_CODE(code, size)
+
+        movsx ebx, word [rsi-DCTSIZE2*2]  ; temp2 = t2[k];
+        ; Mask off any extra bits in code
+        mov rcx, rdi  ; rcx = nbits, which also serves as the size for PUT_BITS below
+        mov rdx, 1
+        shl rdx, cl
+        dec rdx
+        and rbx, rdx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        PUT_BITS rbx  ; PUT_BITS(temp2, nbits)
+
+        shr r11, 1  ; index >>= 1;
+        add rsi, 2  ; ++k;
+        jmp .BLOOP
+.ELOOP:
+        ; If the last coef(s) were zero, emit an end-of-block code
+        lea rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+        cmp rdi, rsi  ; if (r > 0) {
+        je .EFN  ; rsi stopped at the filler slot, so t1[62] was nonzero: no end-of-block code
+        mov   ebx,  INT [r15]  ; code = actbl->ehufco[0];
+        movzx r12d, byte [r15 + 1024]  ; size = actbl->ehufsi[0];
+        EMIT_BITS rbx, r12d
+.EFN:
+        pop r10  ; r10 = state again
+        ; Save put_buffer & put_bits
+        mov MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
+        mov DWORD  [r10+24], put_bits  ; state->cur.put_bits = put_bits;
+
+        pop rbx
+%ifdef WIN64
+        movaps  xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+        movaps  xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+        movaps  xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+        movaps  xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+        add     rsp, 4*SIZEOF_XMMWORD
+%endif
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- unaligned rsp saved in the prologue
+        pop     rbp
+        ret                     ; presumably rax (buffer) carries the JOCTET* return value
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jchuff-sse2.asm b/simd/jchuff-sse2.asm
new file mode 100644
index 0000000..1d82273
--- /dev/null
+++ b/simd/jchuff-sse2.asm
@@ -0,0 +1,427 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright 2009-2011, 2014-2016 D. R. Commander.
+; Copyright 2015 Matthieu Darbois
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0  ; write the oldest 8 buffered bits to [eax], stuffing 0x00 after 0xFF; clobbers ecx, edx
+        sub put_bits, 8  ; put_bits -= 8;
+        mov edx, put_buffer  ; work on a copy so put_buffer keeps its remaining bits
+        mov ecx, put_bits  ; variable shift count has to be in cl
+        shr edx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+        mov byte [eax], dl  ; *buffer++ = c;
+        add eax, 1
+        cmp dl, 0xFF  ; need to stuff a zero byte?
+        jne %%.EMIT_BYTE_END
+        mov byte [eax], 0  ; *buffer++ = 0;
+        add eax, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1  ; (code): append code to the low end of put_buffer; the bit count must already be in ecx
+        add put_bits, ecx  ; put_bits += size;
+        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
+        or  put_buffer, %1  ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF15 0  ; flush two bytes when 16 or more bits are pending; clobbers eax, ecx, edx
+        cmp put_bits, 16  ; if (put_bits > 15) {
+        jl %%.CHECKBUF15_END
+        mov eax, POINTER [esp+buffer]  ; the output pointer lives in a stack slot between flushes
+        EMIT_BYTE
+        EMIT_BYTE
+        mov POINTER [esp+buffer], eax  ; store the advanced pointer back
+%%.CHECKBUF15_END:
+%endmacro
+
+%macro EMIT_BITS 1  ; (code): append first (bit count in ecx), then flush if needed; clobbers eax, ecx, edx
+        PUT_BITS %1
+        CHECKBUF15
+%endmacro
+
+%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3): gather 32 words at indices jno* from [esi], store abs values at t1+ko and emit-ready values at t2+ko; clobbers xmm4-xmm7
+    pxor xmm4, xmm4  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm5, xmm5  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm6, xmm6  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm7, xmm7  ; __m128i neg = _mm_setzero_si128();
+    pinsrw %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw %37, ecx, 7  ; xmm_shadow[31] = low word of ecx (presumably zeroed by the caller); jno31 is ignored when ko == 32
+%endif
+    pcmpgtw xmm4, %34  ; neg = _mm_cmpgt_epi16(neg, x1);  all-ones in lanes where x1 < 0
+    pcmpgtw xmm5, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm6, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm7, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw %34, xmm4   ; x1 = _mm_add_epi16(x1, neg);  subtracts 1 from negative lanes
+    paddw %35, xmm5   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %36, xmm6  ; x1 = _mm_add_epi16(x1, neg);
+    paddw %37, xmm7  ; x1 = _mm_add_epi16(x1, neg);
+    pxor %34, xmm4    ; x1 = _mm_xor_si128(x1, neg);  x1 is now abs(x1)
+    pxor %35, xmm5    ; x1 = _mm_xor_si128(x1, neg);
+    pxor %36, xmm6   ; x1 = _mm_xor_si128(x1, neg);
+    pxor %37, xmm7   ; x1 = _mm_xor_si128(x1, neg);
+    pxor xmm4, %34    ; neg = _mm_xor_si128(neg, x1);  value itself if non-negative, else complement of its magnitude
+    pxor xmm5, %35    ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm6, %36   ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm7, %37   ; neg = _mm_xor_si128(neg, x1);
+    movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET*)
+; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
+;                                   JCOEFPTR block, int last_dc_val,
+;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; eax + 8 = working_state *state
+; eax + 12 = JOCTET *buffer
+; eax + 16 = JCOEFPTR block
+; eax + 20 = int last_dc_val
+; eax + 24 = c_derived_tbl *dctbl
+; eax + 28 = c_derived_tbl *actbl
+
+%define pad             6*SIZEOF_DWORD  ; Align to 16 bytes
+%define t1              pad
+%define t2              t1+(DCTSIZE2*SIZEOF_WORD)
+%define block           t2+(DCTSIZE2*SIZEOF_WORD)
+%define actbl           block+SIZEOF_DWORD
+%define buffer          actbl+SIZEOF_DWORD
+%define temp            buffer+SIZEOF_DWORD
+%define temp2           temp+SIZEOF_DWORD
+%define temp3           temp2+SIZEOF_DWORD
+%define temp4           temp3+SIZEOF_DWORD
+%define temp5           temp4+SIZEOF_DWORD
+%define gotptr          temp5+SIZEOF_DWORD  ; void *gotptr
+%define put_buffer      ebx
+%define put_bits        edi
+
+        align   16
+        global  EXTN(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = unaligned frame pointer (arguments are at eax+8 ..)
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; kept here for the "pop esp" in the epilogue
+        mov     ebp,esp                         ; ebp = aligned ebp
+        sub     esp, temp5+9*SIZEOF_DWORD-pad   ; reserve the local area (t1 .. gotptr)
+        push    ebx
+        push    ecx
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+        push    ebp
+
+        mov esi, POINTER [eax+8]        ; (working_state *state)
+        mov put_buffer,  DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
+        mov put_bits,    DWORD [esi+12]  ; put_bits = state->cur.put_bits;
+        push esi  ; save state for the epilogue (6th push, matching pad); esi is now scratch
+
+        get_GOT edx                       ; get GOT address
+        movpic POINTER [esp+gotptr], edx  ; save GOT address
+
+        mov ecx, POINTER [eax+28]
+        mov edx, POINTER [eax+16]
+        mov esi, POINTER [eax+12]
+        mov POINTER [esp+actbl],  ecx
+        mov POINTER [esp+block],  edx
+        mov POINTER [esp+buffer], esi
+
+        ; Encode the DC coefficient difference per section F.1.2.1
+        mov esi, POINTER [esp+block]        ; block
+        movsx ecx, word [esi]  ; temp = temp2 = block[0] - last_dc_val;
+        sub   ecx, DWORD [eax+20]
+        mov   esi, ecx
+
+        ; This is a well-known technique for obtaining the absolute value
+        ; without a branch.  It is derived from an assembly language technique
+        ; presented in "How to Optimize for the Pentium Processors",
+        ; Copyright (c) 1996, 1997 by Agner Fog.
+        mov edx, ecx
+        sar edx, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+        xor ecx, edx ; temp ^= temp3;
+        sub ecx, edx ; temp -= temp3;
+
+        ; For a negative input, want temp2 = bitwise complement of abs(input)
+        ; This code assumes we are on a two's complement machine
+        add esi, edx  ; temp2 += temp3;
+        mov DWORD [esp+temp], esi  ; backup temp2 in temp
+
+        ; Find the number of bits needed for the magnitude of the coefficient
+        movpic ebp, POINTER [esp+gotptr]   ; load GOT address (ebp)
+        movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
+        mov DWORD [esp+temp2], edx  ; backup nbits in temp2
+
+        ; Emit the Huffman-coded symbol for the number of bits
+        mov    ebp, POINTER [eax+24]  ; After this point, arguments are not accessible anymore
+        mov    eax,  INT [ebp + edx * 4]  ; code = dctbl->ehufco[nbits];
+        movzx  ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
+        EMIT_BITS eax  ; EMIT_BITS(code, size)
+
+        mov ecx, DWORD [esp+temp2]  ; restore nbits
+
+        ; Mask off any extra bits in code
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, DWORD [esp+temp]  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+        ; Emit that number of bits of the value, if positive,
+        ; or the complement of its magnitude, if negative.
+        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
+
+        ; Prepare data
+        xor ecx, ecx  ; ecx = 0; kloop_prepare inserts it as the last word when its first argument is 32
+        mov esi, POINTER [esp+block]
+        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                       xmm0, xmm1, xmm2, xmm3
+        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                       xmm0, xmm1, xmm2, xmm3
+
+        pxor xmm7, xmm7
+        movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
+        movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
+        movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
+        movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
+        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb ecx, xmm2  ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        shl ecx, 16
+        or  edx, ecx
+        not edx  ; index = ~index;  (bit i set <=> t1[i] != 0)
+
+        lea esi, [esp+t1]  ; esi = &t1[k], k = 0
+        mov ebp, POINTER [esp+actbl]  ; ebp = actbl
+
+.BLOOP:
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP  ; index == 0: nothing nonzero left in the first 32 entries
+        lea esi, [esi+ecx*2]  ; k += r;
+        shr edx, cl  ; index >>= r;
+        mov DWORD [esp+temp3], edx  ; spill index; edx is reused below
+.BRLOOP:
+        cmp ecx, 16  ; while (r > 15) {
+        jl .ERLOOP
+        sub ecx, 16 ; r -= 16;
+        mov DWORD [esp+temp], ecx  ; spill r while ecx holds the code size
+        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
+        mov ecx, DWORD [esp+temp]  ; restore r
+        jmp .BRLOOP
+.ERLOOP:
+        movsx eax, word [esi]  ; temp = t1[k];
+        movpic edx, POINTER [esp+gotptr]   ; load GOT address (edx)
+        movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
+        mov DWORD [esp+temp2], eax
+        ; Emit Huffman symbol for run length / number of bits
+        shl ecx, 4  ; temp3 = (r << 4) + nbits;
+        add ecx, eax
+        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+        EMIT_BITS eax
+
+        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
+        ; Mask off any extra bits in code
+        mov ecx, DWORD [esp+temp2]
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
+        mov edx, DWORD [esp+temp3]  ; reload index
+        add esi, 2  ; ++k;
+        shr edx, 1  ; index >>= 1;
+
+        jmp .BLOOP
+.ELOOP:
+        movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
+        movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
+        movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
+        movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
+        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb ecx, xmm2  ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        shl ecx, 16
+        or  edx, ecx
+        not edx  ; index = ~index;
+
+        lea eax, [esp + t1 + (DCTSIZE2/2) * 2]  ; eax = &t1[DCTSIZE2/2]
+        sub eax, esi  ; ... minus &t1[k], in bytes
+        shr eax, 1  ; eax = entries between k and the start of the second half
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP2
+        shr edx, cl  ; index >>= r;
+        add ecx, eax  ; r also counts the zeros left over from the first half
+        lea esi, [esi+ecx*2]  ; k += r;
+        mov DWORD [esp+temp3], edx
+        jmp .BRLOOP2
+.BLOOP2:
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP2
+        lea esi, [esi+ecx*2]  ; k += r;
+        shr edx, cl  ; index >>= r;
+        mov DWORD [esp+temp3], edx
+.BRLOOP2:
+        cmp ecx, 16  ; while (r > 15) {
+        jl .ERLOOP2
+        sub ecx, 16  ; r -= 16;
+        mov DWORD [esp+temp], ecx
+        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
+        mov ecx, DWORD [esp+temp]
+        jmp .BRLOOP2
+.ERLOOP2:
+        movsx eax, word [esi]  ; temp = t1[k];
+        bsr eax, eax  ; nbits = 32 - __builtin_clz(temp);
+        inc eax  ; bsr gives the index of the highest set bit, so +1 = bit count (.ERLOOP uses jpeg_nbits_table instead)
+        mov DWORD [esp+temp2], eax
+        ; Emit Huffman symbol for run length / number of bits
+        shl ecx, 4  ; temp3 = (r << 4) + nbits;
+        add ecx, eax
+        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+        EMIT_BITS eax
+
+        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
+        ; Mask off any extra bits in code
+        mov ecx, DWORD [esp+temp2]
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
+        mov edx, DWORD [esp+temp3]
+        add esi, 2  ; ++k;
+        shr edx, 1  ; index >>= 1;
+
+        jmp .BLOOP2
+.ELOOP2:
+        ; If the last coef(s) were zero, emit an end-of-block code
+        lea edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+        cmp edx, esi  ; if (r > 0) {
+        je .EFN  ; k already ran to the end: no EOB needed
+        mov   eax,  INT [ebp]  ; code = actbl->ehufco[0];
+        movzx ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
+        EMIT_BITS eax
+.EFN:
+        mov eax, [esp+buffer]  ; eax = value of the buffer slot (the JOCTET* result)
+        pop esi  ; esi = state, pushed in the prologue
+        ; Save put_buffer & put_bits
+        mov DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+        mov DWORD [esi+12], put_bits  ; state->cur.put_bits = put_bits;
+
+        pop     ebp
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+        pop     ecx
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcsample-altivec.c b/simd/jcsample-altivec.c
new file mode 100644
index 0000000..603492d
--- /dev/null
+++ b/simd/jcsample-altivec.c
@@ -0,0 +1,158 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void
+jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int outrow, outcol;
+  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JSAMPROW inptr, outptr;
+
+  __vector unsigned char this0, next0, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants (pw_bias is added to each pair sum before the >> 1 below) */
+  __vector unsigned short pw_bias = { __4X2(0, 1) },
+    pw_one = { __8X(1) };
+  __vector unsigned char even_odd_index =
+    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},  /* even bytes, then odd bytes */
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);  /* pad input rows to 2x output width */
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr += 32, outptr += 16) {  /* 32 in -> 16 out */
+
+      this0 = vec_ld(0, inptr);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      outl = vec_add(this0e, this0o);  /* even + odd neighbour */
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_one);  /* (sum + bias) >> 1 */
+
+      if (outcol > 8) {  /* second 16 input bytes only feed outputs 8..15 */
+        next0 = vec_ld(16, inptr);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        outh = vec_add(next0e, next0o);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_one);
+      } else
+        outh = vec_splat_u16(0);  /* nothing left: upper half is zero */
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);  /* a full 16-byte vector is stored either way */
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+
+  __vector unsigned char this0, next0, this1, next1, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants (pw_bias is added to each 2x2 sum before the >> 2 below) */
+  __vector unsigned short pw_bias = { __4X2(1, 2) },
+    pw_two = { __8X(2) };
+  __vector unsigned char even_odd_index =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);  /* pad input rows to 2x output width */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {  /* two input rows per output row */
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr0);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      out0l = vec_add(this0e, this0o);  /* upper row: even + odd neighbour */
+
+      this1 = vec_ld(0, inptr1);
+      this1 = vec_perm(this1, this1, even_odd_index);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+      out1l = vec_add(this1e, this1o);  /* lower row: even + odd neighbour */
+
+      outl = vec_add(out0l, out1l);  /* sum of the 2x2 neighbourhood */
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_two);  /* (sum + bias) >> 2 */
+
+      if (outcol > 8) {  /* second 16 input bytes only feed outputs 8..15 */
+        next0 = vec_ld(16, inptr0);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        out0h = vec_add(next0e, next0o);
+
+        next1 = vec_ld(16, inptr1);
+        next1 = vec_perm(next1, next1, even_odd_index);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+        out1h = vec_add(next1e, next1o);
+
+        outh = vec_add(out0h, out1h);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_two);
+      } else
+        outh = vec_splat_u16(0);  /* nothing left: upper half is zero */
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);  /* a full 16-byte vector is stored either way */
+    }
+  }
+}
diff --git a/simd/jcsample.h b/simd/jcsample.h
new file mode 100644
index 0000000..2a50544
--- /dev/null
+++ b/simd/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge (JSAMPARRAY image_data, int num_rows,
+                   JDIMENSION input_cols, JDIMENSION output_cols)
+{
+  /* Widen each of the num_rows rows from input_cols to output_cols samples
+   * by repeating its rightmost sample.  No-op unless the rows must grow. */
+  int pad_cols = (int) (output_cols - input_cols);
+  int r, c;
+
+  if (pad_cols <= 0)
+    return;
+
+  for (r = 0; r < num_rows; r++) {
+    JSAMPROW dst = image_data[r] + input_cols;
+    JSAMPLE edge = dst[-1];     /* raw copy, so GETJSAMPLE() is not needed */
+    for (c = 0; c < pad_cols; c++)
+      dst[c] = edge;
+  }
+}
diff --git a/simd/jdcolext-altivec.c b/simd/jdcolext-altivec.c
new file mode 100644
index 0000000..1ae91b9
--- /dev/null
+++ b/simd/jdcolext-altivec.c
@@ -0,0 +1,274 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                    JDIMENSION input_row,
+                                    JSAMPARRAY output_buf, int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;  /* pitch = output bytes per row */
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __vector int g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = pitch; num_cols > 0;  /* num_cols = output bytes still to write */
+         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+         inptr0 += 16, inptr1 += 16, inptr2 += 16) {  /* 16 pixels per pass */
+
+      y = vec_ld(0, inptr0);
+      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
+
+      cb = vec_ld(0, inptr1);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
+      cbl = vec_sub(cbl, pw_cj);  /* centre chroma around zero */
+      cbh = vec_sub(cbh, pw_cj);
+
+      cr = vec_ld(0, inptr2);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
+      crl = vec_sub(crl, pw_cj);
+      crh = vec_sub(crh, pw_cj);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+      bl = vec_add(cbl, cbl);  /* 2 * Cb, paired with the halved constant */
+      bh = vec_add(cbh, cbh);
+      bl = vec_madds(bl, pw_mf0228, pw_one);
+      bh = vec_madds(bh, pw_mf0228, pw_one);
+      bl = vec_sra(bl, (__vector unsigned short)pw_one);
+      bh = vec_sra(bh, (__vector unsigned short)pw_one);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, yl);
+      bh = vec_add(bh, yh);
+
+      rl = vec_add(crl, crl);  /* 2 * Cr, paired with the halved constant */
+      rh = vec_add(crh, crh);
+      rl = vec_madds(rl, pw_f0402, pw_one);
+      rh = vec_madds(rh, pw_f0402, pw_one);
+      rl = vec_sra(rl, (__vector unsigned short)pw_one);
+      rh = vec_sra(rh, (__vector unsigned short)pw_one);
+      rl = vec_add(rl, crl);
+      rh = vec_add(rh, crh);
+      rl = vec_add(rl, yl);
+      rh = vec_add(rh, yh);
+
+      g0w = vec_mergeh(cbl, crl);  /* interleave Cb,Cr to match pw_mf0344_f0285 */
+      g1w = vec_mergel(cbl, crl);
+      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+      g2w = vec_mergeh(cbh, crh);
+      g3w = vec_mergel(cbh, crh);
+      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+      gl = vec_sub(gl, crl);
+      gh = vec_sub(gh, crh);
+      gl = vec_add(gl, yl);
+      gh = vec_add(gh, yh);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);  /* X (filler) samples are 255 */
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);  /* saturate to unsigned bytes */
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;  /* nonzero => outptr is not 16-byte aligned */
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;  /* span counted from the aligned address */
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: stage in tmpbuf, then copy only the bytes that remain */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path: store whole vectors directly, as many as still needed */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
diff --git a/simd/jdcolor-altivec.c b/simd/jdcolor-altivec.c
new file mode 100644
index 0000000..e0892d8
--- /dev/null
+++ b/simd/jdcolor-altivec.c
@@ -0,0 +1,96 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554              /* FIX(0.34414) */
+#define F_0_714 46802              /* FIX(0.71414) */
+#define F_1_402 91881              /* FIX(1.40200) */
+#define F_1_772 116130             /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/simd/jdmerge-altivec.c b/simd/jdmerge-altivec.c
new file mode 100644
index 0000000..cc8d3d9
--- /dev/null
+++ b/simd/jdmerge-altivec.c
@@ -0,0 +1,108 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+/* Fixed-point constants: FIX(x) = x * 2^SCALEBITS, rounded to an integer */
+#define F_0_344 22554              /* FIX(0.34414) */
+#define F_0_714 46802              /* FIX(0.71414) */
+#define F_1_402 91881              /* FIX(1.40200) */
+#define F_1_772 116130             /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+/* First pass: default names; RGB_PIXELSIZE is not set by this file yet. */
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE  /* RGB_INDEX0-2 above are reused */
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extrgb h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extrgbx h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extbgr h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extbgrx h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extxbgr h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extxrgb h2v1 + h2v2 */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/simd/jdmrgext-altivec.c b/simd/jdmrgext-altivec.c
new file mode 100644
index 0000000..3b6950d
--- /dev/null
+++ b/simd/jdmrgext-altivec.c
@@ -0,0 +1,323 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+/* h2v1 merged upsampling + YCbCr->RGB: each Cb/Cr pair feeds two Y samples */
+void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
+                                         JSAMPIMAGE input_buf,
+                                         JDIMENSION in_row_group_ctr,
+                                         JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+  /* tmpbuf stages one output chunk for the memcpy() slow paths below. */
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+  __vector int g_y0, g_y1, g_y2, g_y3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
+    even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
+    odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
+  /* Y, Cb and Cr input rows for this row group, and the output row */
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+  /* Outer loop: 16 Cb/Cr samples per pass, i.e. up to 32 output pixels */
+  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
+
+    cb = vec_ld(0, inptr1);
+    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+     * support unsigned vectors.
+     */
+    cbl = (__vector signed short)VEC_UNPACKHU(cb);
+    cbh = (__vector signed short)VEC_UNPACKLU(cb);
+    cbl = vec_sub(cbl, pw_cj);
+    cbh = vec_sub(cbh, pw_cj);
+
+    cr = vec_ld(0, inptr2);
+    crl = (__vector signed short)VEC_UNPACKHU(cr);
+    crh = (__vector signed short)VEC_UNPACKLU(cr);
+    crl = vec_sub(crl, pw_cj);
+    crh = vec_sub(crh, pw_cj);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+    b_yl = vec_add(cbl, cbl);
+    b_yh = vec_add(cbh, cbh);
+    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
+    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
+    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+    /* R - Y: same sequence as B - Y, with 0.402 * Cr + Cr */
+    r_yl = vec_add(crl, crl);
+    r_yh = vec_add(crh, crh);
+    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+    r_yl = vec_add(r_yl, crl);
+    r_yh = vec_add(r_yh, crh);
+    /* G - Y: interleave Cb/Cr so each vec_msums() lane sums both products */
+    g_y0w = vec_mergeh(cbl, crl);
+    g_y1w = vec_mergel(cbl, crl);
+    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+    g_y2w = vec_mergeh(cbh, crh);
+    g_y3w = vec_mergel(cbh, crh);
+    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+     * each dword into a new 16-bit vector, which is the equivalent of
+     * descaling the 32-bit results (right-shifting by 16 bits) and then
+     * packing them.
+     */
+    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+                    shift_pack_index);
+    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+                    shift_pack_index);
+    g_yl = vec_sub(g_yl, crl);
+    g_yh = vec_sub(g_yh, crh);
+    /* Two Y vectors per chroma vector: *l terms first, then *h terms */
+    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+         num_cols -= RGB_PIXELSIZE * 16,
+         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
+
+      y = vec_ld(0, inptr0);
+      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
+      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
+      /* Even and odd Y samples of a pair share the same chroma terms. */
+      if (yloop == 0) {
+        be = vec_add(b_yl, ye);
+        bo = vec_add(b_yl, yo);
+        re = vec_add(r_yl, ye);
+        ro = vec_add(r_yl, yo);
+        ge = vec_add(g_yl, ye);
+        go = vec_add(g_yl, yo);
+      } else {
+        be = vec_add(b_yh, ye);
+        bo = vec_add(b_yh, yo);
+        re = vec_add(r_yh, ye);
+        ro = vec_add(r_yh, yo);
+        ge = vec_add(g_yh, ye);
+        go = vec_add(g_yh, yo);
+      }
+      /* Interleave even/odd results back into pixel order */
+      rl = vec_mergeh(re, ro);
+      rh = vec_mergel(re, ro);
+      gl = vec_mergeh(ge, go);
+      gh = vec_mergel(ge, go);
+      bl = vec_mergeh(be, bo);
+      bh = vec_mergel(be, bo);
+      /* Pair R with G, and B with the constant 255 filler */
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+      /* Pack to unsigned bytes with saturation */
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+      /* Store; big endian handles a misaligned outptr explicitly. */
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
+                                         JSAMPIMAGE input_buf,
+                                         JDIMENSION in_row_group_ctr,
+                                         JSAMPARRAY output_buf)
+{
+  JSAMPARRAY luma_rows = input_buf[0];
+  JSAMPROW saved_luma = luma_rows[in_row_group_ctr];
+  JSAMPROW saved_out = output_buf[0];
+
+  /* Pass 1: luma row 2*ctr through the h2v1 kernel into output_buf[0] */
+  luma_rows[in_row_group_ctr] = luma_rows[in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+  /* Pass 2: luma row 2*ctr+1, same chroma row, into output_buf[1] */
+  luma_rows[in_row_group_ctr] = luma_rows[in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+  /* Restore the caller's row pointers */
+  luma_rows[in_row_group_ctr] = saved_luma;
+  output_buf[0] = saved_out;
+}
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
new file mode 100644
index 0000000..63d6d8c
--- /dev/null
+++ b/simd/jdsample-altivec.c
@@ -0,0 +1,392 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+/* h2v1 fancy upsampling: 2x horizontal, 3/4 this sample + 1/4 neighbor */
+void
+jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+    out;
+  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
+    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* Repeat the last sample once when width is not a multiple of 16 */
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+    /* Column 0 has no left neighbor: last_index_col0 reuses sample 0 */
+    this0 = vec_ld(0, inptr);
+    p_last0 = vec_perm(this0, this0, last_index_col0);
+    last0 = this0;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr += 16, outptr += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_last0 = vec_perm(last0, this0, last_index);
+        last0 = this0;
+      }
+      /* Right neighbors: repeat the final sample, or read 16 ahead */
+      if (incol <= 16)
+        p_next0 = vec_perm(this0, this0, next_index_lastcol);
+      else {
+        next0 = vec_ld(16, inptr);
+        p_next0 = vec_perm(this0, next0, next_index);
+      }
+      /* 3 * this as 16-bit values, re-interleaved into sample order */
+      this0e = (__vector short)vec_mule(this0, pb_three);
+      this0o = (__vector short)vec_mulo(this0, pb_three);
+      this0l = vec_mergeh(this0e, this0o);
+      this0h = vec_mergel(this0e, this0o);
+      /* Widen neighbors; fold in rounding terms (+1 left, +2 right) */
+      last0l = (__vector short)VEC_UNPACKHU(p_last0);
+      last0h = (__vector short)VEC_UNPACKLU(p_last0);
+      last0l = vec_add(last0l, pw_one);
+
+      next0l = (__vector short)VEC_UNPACKHU(p_next0);
+      next0h = (__vector short)VEC_UNPACKLU(p_next0);
+      next0l = vec_add(next0l, pw_two);
+      /* even = (3*this + last + 1) >> 2;  odd = (3*this + next + 2) >> 2 */
+      outle = vec_add(this0l, last0l);
+      outlo = vec_add(this0l, next0l);
+      outle = vec_sr(outle, (__vector unsigned short)pw_two);
+      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+      /* Interleave even/odd result bytes and store 16 output samples */
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr);
+      /* High half of the vector, only when more than 8 columns remain */
+      if (incol > 8) {
+        last0h = vec_add(last0h, pw_one);
+        next0h = vec_add(next0h, pw_two);
+
+        outhe = vec_add(this0h, last0h);
+        outho = vec_add(this0h, next0h);
+        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+        outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr);
+      }
+
+      this0 = next0;
+    }
+  }
+}
+
+/* h2v2 fancy upsampling: 2x in both directions using 3:1 weights per axis */
+void
+jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char this_1, this0, this1, out;
+  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = {0}, nextcolsum_1h = {0},
+    nextcolsum1l = {0}, nextcolsum1h = {0},
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) },
+    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
+    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
+    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
+    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+  __vector unsigned short pw_four = { __8X(4) };
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    /* Row above, current row, row below (index -1 is read when inrow == 0) */
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    /* Repeat each row's last sample once when width is not a multiple of 16 */
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+    /* Prime chunk 0: colsum = 3 * current row + adjacent row, 16 bits */
+    this0 = vec_ld(0, inptr0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
+    this0l = vec_mladd(this0l, pw_three, pw_zero);
+    this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+    this_1 = vec_ld(0, inptr_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
+    thiscolsum_1l = vec_add(this0l, this_1l);
+    thiscolsum_1h = vec_add(this0h, this_1h);
+    lastcolsum_1h = thiscolsum_1h;
+    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+    this1 = vec_ld(0, inptr1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
+    thiscolsum1l = vec_add(this0l, this1l);
+    thiscolsum1h = vec_add(this0h, this1h);
+    lastcolsum1h = thiscolsum1h;
+    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+         outptr0 += 32, outptr1 += 32) {
+      /* Past chunk 0, "last" columns come from the previous chunk */
+      if (downsampled_width - incol > 0) {
+        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
+      }
+      /* "next" columns: repeat the final one, or read 16 columns ahead */
+      if (incol <= 16) {
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+                                   next_index_lastcol);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+                                  next_index_lastcol);
+      } else {
+        this0 = vec_ld(16, inptr0);
+        this0l = (__vector short)VEC_UNPACKHU(this0);
+        this0h = (__vector short)VEC_UNPACKLU(this0);
+        this0l = vec_mladd(this0l, pw_three, pw_zero);
+        this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+        this_1 = vec_ld(16, inptr_1);
+        this_1l = (__vector short)VEC_UNPACKHU(this_1);
+        this_1h = (__vector short)VEC_UNPACKLU(this_1);
+        nextcolsum_1l = vec_add(this0l, this_1l);
+        nextcolsum_1h = vec_add(this0h, this_1h);
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+        this1 = vec_ld(16, inptr1);
+        this1l = (__vector short)VEC_UNPACKHU(this1);
+        this1h = (__vector short)VEC_UNPACKLU(this1);
+        nextcolsum1l = vec_add(this0l, this1l);
+        nextcolsum1h = vec_add(this0h, this1h);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+      }
+
+      /* Process the upper row */
+      /* even: (3*sum + last + 8) >> 4;  odd: (3*sum + next + 7) >> 4 */
+      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum_1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum_1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr0);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum_1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum_1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr0);
+      }
+
+      /* Process the lower row */
+      /* Same filter, applied to the sums formed with the row below */
+      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr1);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr1);
+      }
+      /* Slide the window: the prefetched sums become the current ones */
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void
+jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  /* Plain 2:1 horizontal upsampling: every input sample is written twice. */
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  int row;
+
+  for (row = 0; row < max_v_samp_factor; row++) {
+    JSAMPROW src = input_data[row];
+    JSAMPROW dst = dst_rows[row];
+    /* Output width rounded up to a multiple of 32 samples */
+    int remaining = (output_width + 31) & (~31);
+
+    /* Each pass consumes up to 32 input bytes and emits up to 64. */
+    while (remaining > 0) {
+      __vector unsigned char pix = vec_ld(0, src);
+
+      /* vec_mergeh/l(pix, pix) duplicate each byte of the two halves. */
+      vec_st(vec_mergeh(pix, pix), 0, dst);
+      vec_st(vec_mergel(pix, pix), 16, dst);
+
+      if (remaining > 32) {
+        /* A second 16-byte input vector is needed for this pass. */
+        pix = vec_ld(16, src);
+        vec_st(vec_mergeh(pix, pix), 32, dst);
+        vec_st(vec_mergel(pix, pix), 48, dst);
+      }
+
+      remaining -= 64;
+      src += 32;
+      dst += 64;
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  /* Plain 2x2 upsampling: each input row is doubled horizontally and the
+   * result is stored to two consecutive output rows.
+   */
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  int src_row = 0, dst_row = 0;
+
+  while (dst_row < max_v_samp_factor) {
+    JSAMPROW src = input_data[src_row];
+    JSAMPROW top = dst_rows[dst_row];
+    JSAMPROW bottom = dst_rows[dst_row + 1];
+    /* Output width rounded up to a multiple of 32 samples */
+    int remaining = (output_width + 31) & (~31);
+
+    while (remaining > 0) {
+      __vector unsigned char pix = vec_ld(0, src), dup0, dup1;
+
+      /* Duplicate every byte, then store the same vectors to both rows. */
+      dup0 = vec_mergeh(pix, pix);
+      dup1 = vec_mergel(pix, pix);
+      vec_st(dup0, 0, top);
+      vec_st(dup0, 0, bottom);
+      vec_st(dup1, 16, top);
+      vec_st(dup1, 16, bottom);
+
+      if (remaining > 32) {
+        pix = vec_ld(16, src);
+        dup0 = vec_mergeh(pix, pix);
+        dup1 = vec_mergel(pix, pix);
+        vec_st(dup0, 32, top);
+        vec_st(dup0, 32, bottom);
+        vec_st(dup1, 48, top);
+        vec_st(dup1, 48, bottom);
+      }
+
+      remaining -= 64;  src += 32;  top += 64;  bottom += 64;
+    }
+    src_row++;  dst_row += 2;
+  }
+}
diff --git a/simd/jdsample-mmx.asm b/simd/jdsample-mmx.asm
index 88564e4..c9e2b8b 100644
--- a/simd/jdsample-mmx.asm
+++ b/simd/jdsample-mmx.asm
@@ -49,13 +49,13 @@
 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
 ;                                JDIMENSION downsampled_width,
 ;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
+;                                JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
@@ -202,18 +202,18 @@
 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
 ;                                JDIMENSION downsampled_width,
 ;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
+;                                JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
 %define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
 
         align   16
         global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
@@ -533,13 +533,13 @@
 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
 ;                          JDIMENSION output_width,
 ;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
+;                          JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define output_width(b)         (b)+12          ; JDIMENSION output_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_upsample_mmx)
@@ -634,13 +634,13 @@
 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
 ;                          JDIMENSION output_width,
 ;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
+;                          JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define output_width(b)         (b)+12          ; JDIMENSION output_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v2_upsample_mmx)
diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm
index 2287c00..3aec69f 100644
--- a/simd/jdsample-sse2-64.asm
+++ b/simd/jdsample-sse2-64.asm
@@ -50,13 +50,13 @@
 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
 ;                                 JDIMENSION downsampled_width,
 ;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
+;                                 JSAMPARRAY *output_data_ptr);
 ;
 
 ; r10 = int max_v_samp_factor
 ; r11 = JDIMENSION downsampled_width
 ; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
+; r13 = JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
@@ -189,13 +189,13 @@
 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
 ;                                 JDIMENSION downsampled_width,
 ;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
+;                                 JSAMPARRAY *output_data_ptr);
 ;
 
 ; r10 = int max_v_samp_factor
 ; r11 = JDIMENSION downsampled_width
 ; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
+; r13 = JSAMPARRAY *output_data_ptr
 
 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 %define WK_NUM          4
@@ -489,13 +489,13 @@
 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
 ;                           JDIMENSION output_width,
 ;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
+;                           JSAMPARRAY *output_data_ptr);
 ;
 
 ; r10 = int max_v_samp_factor
 ; r11 = JDIMENSION output_width
 ; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
+; r13 = JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_upsample_sse2)
@@ -578,13 +578,13 @@
 ; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
 ;                           JDIMENSION output_width,
 ;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
+;                           JSAMPARRAY *output_data_ptr);
 ;
 
 ; r10 = int max_v_samp_factor
 ; r11 = JDIMENSION output_width
 ; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
+; r13 = JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v2_upsample_sse2)
diff --git a/simd/jdsample-sse2.asm b/simd/jdsample-sse2.asm
index 51176d4..f75e594 100644
--- a/simd/jdsample-sse2.asm
+++ b/simd/jdsample-sse2.asm
@@ -49,13 +49,13 @@
 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
 ;                                 JDIMENSION downsampled_width,
 ;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
+;                                 JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
@@ -200,18 +200,18 @@
 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
 ;                                 JDIMENSION downsampled_width,
 ;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
+;                                 JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 %define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
 
         align   16
         global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
@@ -529,13 +529,13 @@
 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
 ;                           JDIMENSION output_width,
 ;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
+;                           JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define output_width(b)         (b)+12          ; JDIMENSION output_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v1_upsample_sse2)
@@ -628,13 +628,13 @@
 ; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
 ;                           JDIMENSION output_width,
 ;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
+;                           JSAMPARRAY *output_data_ptr);
 ;
 
 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
 %define output_width(b)         (b)+12          ; JDIMENSION output_width
 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
 
         align   16
         global  EXTN(jsimd_h2v2_upsample_sse2)
diff --git a/simd/jfdctflt-3dn.asm b/simd/jfdctflt-3dn.asm
index e23c521..133fe4d 100644
--- a/simd/jfdctflt-3dn.asm
+++ b/simd/jfdctflt-3dn.asm
@@ -45,10 +45,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
+; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
 ;
 
-%define data(b)         (b)+8           ; FAST_FLOAT * data
+%define data(b)         (b)+8           ; FAST_FLOAT *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm
index 4184e95..02d5463 100644
--- a/simd/jfdctflt-sse-64.asm
+++ b/simd/jfdctflt-sse-64.asm
@@ -56,10 +56,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
 ;
 
-; r10 = FAST_FLOAT * data
+; r10 = FAST_FLOAT *data
 
 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 %define WK_NUM          2
diff --git a/simd/jfdctflt-sse.asm b/simd/jfdctflt-sse.asm
index 477f5c8..c2f61c8 100644
--- a/simd/jfdctflt-sse.asm
+++ b/simd/jfdctflt-sse.asm
@@ -55,10 +55,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
 ;
 
-%define data(b)         (b)+8           ; FAST_FLOAT * data
+%define data(b)         (b)+8           ; FAST_FLOAT *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
new file mode 100644
index 0000000..c4cc26e
--- /dev/null
+++ b/simd/jfdctfst-altivec.c
@@ -0,0 +1,156 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98   /* FIX(0.382683433) */
+#define F_0_541 139  /* FIX(0.541196100) */
+#define F_0_707 181  /* FIX(0.707106781) */
+#define F_1_306 334  /* FIX(1.306562965) */
+/* FIX(x) above is x * 2^CONST_BITS, rounded to the nearest integer. */
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2  /* data << 2 before each vec_madds */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)  /* = 5 */
+
+/* One 1-D fast-FDCT pass over 8 lanes: reads tmp0..tmp7, sets out0..out7. */
+#define DO_FDCT()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  /* z1 = (tmp12 + tmp13) * 0.707106781; madds keeps (product >> 15) */  \
+  z1 = vec_add(tmp12, tmp13);  \
+  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
+  z1 = vec_madds(z1, pw_0707, pw_zero);  \
+  \
+  out2 = vec_add(tmp13, z1);  \
+  out6 = vec_sub(tmp13, z1);  \
+  \
+  /* Odd part */  \
+  \
+  tmp10 = vec_add(tmp4, tmp5);  \
+  tmp11 = vec_add(tmp5, tmp6);  \
+  tmp12 = vec_add(tmp6, tmp7);  \
+  /* z5 = (tmp10 - tmp12) * 0.382683433, on the pre-scaled operands */  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
+  z5 = vec_sub(tmp10, tmp12);  \
+  z5 = vec_madds(z5, pw_0382, pw_zero);  \
+  /* z2 = tmp10 * 0.541196100 + z5;  z4 = tmp12 * 1.306562965 + z5 */  \
+  z2 = vec_madds(tmp10, pw_0541, z5);  \
+  z4 = vec_madds(tmp12, pw_1306, z5);  \
+  /* z3 = tmp11 * 0.707106781 */  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
+  \
+  z11 = vec_add(tmp7, z3);  \
+  z13 = vec_sub(tmp7, z3);  \
+  \
+  out5 = vec_add(z13, z2);  \
+  out3 = vec_sub(z13, z2);  \
+  out1 = vec_add(z11, z4);  \
+  out7 = vec_sub(z11, z4);  \
+}
+
+/* In-place fast integer forward DCT of the 8x8 block at data (AltiVec). */
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants (__8X is defined elsewhere; presumably splats one value) */
+  __vector short pw_zero = { __8X(0) },
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows */
+  /* NOTE(review): vec_ld requires data to be 16-byte aligned -- confirm. */
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+  /* TRANSPOSE is defined outside this file; presumably row* -> col*. */
+  TRANSPOSE(row, col);
+  /* Butterfly stage: sums go to tmp0..tmp3, differences to tmp7..tmp4. */
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns */
+  /* The pass-1 outputs are transposed back into row0..row7 first. */
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+  /* Write the result back over the input block. */
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/simd/jfdctfst-mmx.asm b/simd/jfdctfst-mmx.asm
index 2c2d20c..41ba00e 100644
--- a/simd/jfdctfst-mmx.asm
+++ b/simd/jfdctfst-mmx.asm
@@ -70,10 +70,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM * data)
+; jsimd_fdct_ifast_mmx (DCTELEM *data)
 ;
 
-%define data(b)         (b)+8           ; DCTELEM * data
+%define data(b)         (b)+8           ; DCTELEM *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm
index 13c4efc..f9b1551 100644
--- a/simd/jfdctfst-sse2-64.asm
+++ b/simd/jfdctfst-sse2-64.asm
@@ -71,10 +71,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+; jsimd_fdct_ifast_sse2 (DCTELEM *data)
 ;
 
-; r10 = DCTELEM * data
+; r10 = DCTELEM *data
 
 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 %define WK_NUM          2
diff --git a/simd/jfdctfst-sse2.asm b/simd/jfdctfst-sse2.asm
index e6e4b72..ebbadad 100644
--- a/simd/jfdctfst-sse2.asm
+++ b/simd/jfdctfst-sse2.asm
@@ -70,10 +70,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+; jsimd_fdct_ifast_sse2 (DCTELEM *data)
 ;
 
-%define data(b)         (b)+8           ; DCTELEM * data
+%define data(b)         (b)+8           ; DCTELEM *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
new file mode 100644
index 0000000..c13850a
--- /dev/null
+++ b/simd/jfdctint-altivec.c
@@ -0,0 +1,262 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+/* FIX(x) above is x * 2^CONST_BITS, rounded to the nearest integer. */
+#define CONST_BITS 13
+#define PASS1_BITS 2  /* pass-1 outputs stay scaled up by 2^PASS1_BITS */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)  /* shift after pass-1 products */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)  /* shift after pass-2 products */
+
+/* Shared tail of both passes: out2/out6 and the odd outputs (PASS = 1 or 2) */
+#define DO_FDCT_COMMON(PASS)  \
+{  \
+  /* (Original)  \
+   * z1 = (tmp12 + tmp13) * 0.541196100;  \
+   * data2 = z1 + tmp13 * 0.765366865;  \
+   * data6 = z1 + tmp12 * -1.847759065;  \
+   *  \
+   * (This implementation)  \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
+   */  \
+  /* Interleave so each 32-bit lane holds one (tmp13, tmp12) pair */  \
+  tmp1312l = vec_mergeh(tmp13, tmp12);  \
+  tmp1312h = vec_mergel(tmp13, tmp12);  \
+  /* vec_msums: a0 * b0 + a1 * b1 per pair, plus the rounding constant */  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
+  /* Shift out the fixed-point fraction */  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  /* Narrow the 32-bit lanes back to 16 bits */  \
+  out2 = vec_pack(out2l, out2h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(tmp4, tmp6);  \
+  z4 = vec_add(tmp5, tmp7);  \
+  \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
+  \
+  /* (Original)  \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
+   */  \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7);  \
+  tmp47h = vec_mergel(tmp4, tmp7);  \
+  /* z3l..z4h already contain the rounding constant, so add them as-is */  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  \
+  out7 = vec_pack(out7l, out7h);  \
+  out1 = vec_pack(out1l, out1h);  \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6);  \
+  tmp56h = vec_mergel(tmp5, tmp6);  \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  \
+  out5 = vec_pack(out5l, out5h);  \
+  out3 = vec_pack(out3l, out3h);  \
+}
+/* Pass 1: out0/out4 are scaled up by 2^PASS1_BITS; the rest use DESCALE_P1. */
+#define DO_FDCT_PASS1()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_sl(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_sl(out4, pass1_bits);  \
+  /* Remaining outputs come from the shared multiply/descale code */  \
+  DO_FDCT_COMMON(1);  \
+}
+/* Pass 2: out0/out4 are rounded, then shifted right by PASS1_BITS. */
+#define DO_FDCT_PASS2()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  /* pw_descale_p2x adds half an output unit before the arithmetic shift */  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_add(out0, pw_descale_p2x);  \
+  out0  = vec_sra(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_add(out4, pw_descale_p2x);  \
+  out4  = vec_sra(out4, pass1_bits);  \
+  /* Remaining outputs come from the shared multiply/descale code */  \
+  DO_FDCT_COMMON(2);  \
+}
+
+/* In-place slow-integer forward DCT of the 8x8 block at data (AltiVec). */
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+  /* Names encode the coefficient pair: f130 = 0.541 + 0.765, "m" = minus. */
+  /* Constants */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows */
+  /* NOTE(review): vec_ld requires data to be 16-byte aligned -- confirm. */
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+  /* TRANSPOSE is defined outside this file; presumably row* -> col*. */
+  TRANSPOSE(row, col);
+  /* Butterfly stage: sums go to tmp0..tmp3, differences to tmp7..tmp4. */
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+  /* The pass-1 outputs are transposed back into row0..row7 first. */
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+  /* Write the result back over the input block. */
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/simd/jfdctint-mmx.asm b/simd/jfdctint-mmx.asm
index 8536ae2..47f6041 100644
--- a/simd/jfdctint-mmx.asm
+++ b/simd/jfdctint-mmx.asm
@@ -91,10 +91,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM * data)
+; jsimd_fdct_islow_mmx (DCTELEM *data)
 ;
 
-%define data(b)         (b)+8           ; DCTELEM * data
+%define data(b)         (b)+8           ; DCTELEM *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm
index 53ec527..c23fcfb 100644
--- a/simd/jfdctint-sse2-64.asm
+++ b/simd/jfdctint-sse2-64.asm
@@ -92,10 +92,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
+; jsimd_fdct_islow_sse2 (DCTELEM *data)
 ;
 
-; r10 = DCTELEM * data
+; r10 = DCTELEM *data
 
 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 %define WK_NUM          6
diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm
index 85133f8..6b42ce5 100644
--- a/simd/jfdctint-sse2.asm
+++ b/simd/jfdctint-sse2.asm
@@ -91,10 +91,10 @@
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
+; jsimd_fdct_islow_sse2 (DCTELEM *data)
 ;
 
-%define data(b)         (b)+8           ; DCTELEM * data
+%define data(b)         (b)+8           ; DCTELEM *data
 
 %define original_ebp    ebp+0
 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
diff --git a/simd/jidctflt-3dn.asm b/simd/jidctflt-3dn.asm
index ea2e270..24bd105 100644
--- a/simd/jidctflt-3dn.asm
+++ b/simd/jidctflt-3dn.asm
@@ -47,11 +47,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
 ;                         JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -86,7 +86,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     ecx, DCTSIZE/2                          ; ctr
         alignx  16,7
 .columnloop:
@@ -290,7 +290,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
         mov     ecx, DCTSIZE/2                          ; ctr
diff --git a/simd/jidctflt-sse.asm b/simd/jidctflt-sse.asm
index 6160e41..9605b73 100644
--- a/simd/jidctflt-sse.asm
+++ b/simd/jidctflt-sse.asm
@@ -57,11 +57,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -96,7 +96,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     ecx, DCTSIZE/4                          ; ctr
         alignx  16,7
 .columnloop:
@@ -369,7 +369,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
         mov     ecx, DCTSIZE/4                          ; ctr
diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm
index 95bd4dc..3f53501 100644
--- a/simd/jidctflt-sse2-64.asm
+++ b/simd/jidctflt-sse2-64.asm
@@ -58,11 +58,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-; r10 = void * dct_table
+; r10 = void *dct_table
 ; r11 = JCOEFPTR coef_block
 ; r12 = JSAMPARRAY output_buf
 ; r13 = JDIMENSION output_col
@@ -91,7 +91,7 @@
 
         mov     rdx, r10                ; quantptr
         mov     rsi, r11                ; inptr
-        lea     rdi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     rcx, DCTSIZE/4                          ; ctr
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
@@ -324,7 +324,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     rax, [original_rbp]
-        lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     rdi, r12        ; (JSAMPROW *)
         mov     eax, r13d
         mov     rcx, DCTSIZE/4                          ; ctr
diff --git a/simd/jidctflt-sse2.asm b/simd/jidctflt-sse2.asm
index de2cd1f..be899b3 100644
--- a/simd/jidctflt-sse2.asm
+++ b/simd/jidctflt-sse2.asm
@@ -57,11 +57,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -96,7 +96,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     ecx, DCTSIZE/4                          ; ctr
         alignx  16,7
 .columnloop:
@@ -331,7 +331,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
         mov     ecx, DCTSIZE/4                          ; ctr
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
new file mode 100644
index 0000000..67cbe84
--- /dev/null
+++ b/simd/jidctfst-altivec.c
@@ -0,0 +1,257 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082 277              /* FIX(1.082392200) */
+#define F_1_414 362              /* FIX(1.414213562) */
+#define F_1_847 473              /* FIX(1.847759065) */
+#define F_2_613 669              /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS 8  /* fraction bits of the F_* constants: FIX(1) = 256 */
+#define PASS1_BITS 2  /* Pass 2 output is shifted right by PASS1_BITS + 3 */
+#define PRE_MULTIPLY_SCALE_BITS 2  /* left shift applied before vec_madds() */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)  /* = 5 */
+
+
+#define DO_IDCT(in)  \
+{  /* One IDCT pass: in##0..in##7 -> out0..out7, using the caller's locals */  \
+  /* Even part: in##0, in##2, in##4, in##6 -> tmp0..tmp3 */  \
+  \
+  tmp10 = vec_add(in##0, in##4);  \
+  tmp11 = vec_sub(in##0, in##4);  \
+  tmp13 = vec_add(in##2, in##6);  \
+  \
+  tmp12 = vec_sub(in##2, in##6);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  /* * 1.414213562 */  \
+  tmp12 = vec_sub(tmp12, tmp13);  \
+  \
+  tmp0 = vec_add(tmp10, tmp13);  \
+  tmp3 = vec_sub(tmp10, tmp13);  \
+  tmp1 = vec_add(tmp11, tmp12);  \
+  tmp2 = vec_sub(tmp11, tmp12);  \
+  \
+  /* Odd part: in##1, in##3, in##5, in##7 -> tmp4..tmp7 */  \
+  \
+  z13 = vec_add(in##5, in##3);  \
+  z10 = vec_sub(in##5, in##3);  \
+  z10s = vec_sl(z10, pre_multiply_scale_bits);  /* pre-scaled copy of z10 */  \
+  z11 = vec_add(in##1, in##7);  \
+  z12s = vec_sub(in##1, in##7);  \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits);  /* only the scaled z12 is kept */  \
+  \
+  tmp11 = vec_sub(z11, z13);  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  /* * 1.414213562 */  \
+  \
+  tmp7 = vec_add(z11, z13);  \
+  \
+  /* To avoid overflow...  \
+   *  \
+   * (Original)  \
+   * tmp12 = -2.613125930 * z10 + z5;  \
+   *  \
+   * (This implementation)  \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
+   *       = -1.613125930 * z10 - z10 + z5;  \
+   */  \
+  \
+  z5 = vec_add(z10s, z12s);  \
+  z5 = vec_madds(z5, pw_F1847, pw_zero);  /* (z10 + z12) * 1.847759065 */  \
+  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  /* z12 * 1.082392200 */  \
+  tmp10 = vec_sub(tmp10, z5);  \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5);  /* z10 * -1.613125930 + z5 */  \
+  tmp12 = vec_sub(tmp12, z10);  /* the remaining "- z10" uses unscaled z10 */  \
+  \
+  tmp6 = vec_sub(tmp12, tmp7);  \
+  tmp5 = vec_sub(tmp11, tmp6);  \
+  tmp4 = vec_add(tmp10, tmp5);  \
+  \
+  out0 = vec_add(tmp0, tmp7);  \
+  out1 = vec_add(tmp1, tmp6);  \
+  out2 = vec_add(tmp2, tmp5);  \
+  out3 = vec_sub(tmp3, tmp4);  \
+  out4 = vec_add(tmp3, tmp4);  \
+  out5 = vec_sub(tmp2, tmp5);  \
+  out6 = vec_sub(tmp1, tmp6);  \
+  out7 = vec_sub(tmp0, tmp7);  \
+}
+
+
+void
+jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;  /* 16-bit dequantization multipliers */
+  int *outptr;
+  /* Dequantize one 8x8 block, run the fast integer IDCT, store 8 output rows. */
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+
+  /* Constants (pw_MF1613 negates after shifting: << on a negative int is UB) */
+  __vector short pw_zero = { __8X(0) },
+    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+    pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },
+    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+    pass1_bits3 = { __8X(PASS1_BITS + 3) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: load the eight coefficient vectors and process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);  /* OR col1..col7 together: zero iff all are zero */
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);  /* dequantize: col0 * quant0 */
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero: row N is element N of the dequantized col0, splat */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows, then shift right by PASS1_BITS + 3 */
+
+  DO_IDCT(row);
+
+  out0 = vec_sra(out0, pass1_bits3);
+  out1 = vec_sra(out1, pass1_bits3);
+  out2 = vec_sra(out2, pass1_bits3);
+  out3 = vec_sra(out3, pass1_bits3);
+  out4 = vec_sra(out4, pass1_bits3);
+  out5 = vec_sra(out5, pass1_bits3);
+  out6 = vec_sra(out6, pass1_bits3);
+  out7 = vec_sra(out7, pass1_bits3);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);  /* saturate to 8 bits; both halves identical */
+  outb = vec_add(outb, pb_centerjsamp);  /* add CENTERJSAMPLE to every byte */
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);  /* two 32-bit element stores ... */
+  vec_ste((__vector int)outb, 4, outptr);  /* ... write the row's 8 bytes */
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jidctfst-mmx.asm b/simd/jidctfst-mmx.asm
index d97c02a..0e3963d 100644
--- a/simd/jidctfst-mmx.asm
+++ b/simd/jidctfst-mmx.asm
@@ -78,11 +78,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; jpeg_component_info * compptr
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -117,7 +117,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF * wsptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
         mov     ecx, DCTSIZE/4                          ; ctr
         alignx  16,7
 .columnloop:
@@ -323,7 +323,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF * wsptr
+        lea     esi, [workspace]                        ; JCOEF *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
         mov     ecx, DCTSIZE/4                          ; ctr
diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm
index 0f86429..da4ecf2 100644
--- a/simd/jidctfst-sse2-64.asm
+++ b/simd/jidctfst-sse2-64.asm
@@ -79,11 +79,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-; r10 = jpeg_component_info * compptr
+; r10 = void *dct_table
 ; r11 = JCOEFPTR coef_block
 ; r12 = JSAMPARRAY output_buf
 ; r13 = JDIMENSION output_col
diff --git a/simd/jidctfst-sse2.asm b/simd/jidctfst-sse2.asm
index 4658be3..065842c 100644
--- a/simd/jidctfst-sse2.asm
+++ b/simd/jidctfst-sse2.asm
@@ -78,11 +78,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; jpeg_component_info * compptr
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
new file mode 100644
index 0000000..5f1a5df
--- /dev/null
+++ b/simd/jidctint-altivec.c
@@ -0,0 +1,359 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13  /* fraction bits of the F_* constants: FIX(1) = 8192 */
+#define PASS1_BITS 2  /* extra fraction bits left in the Pass 1 output */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)  /* Pass 1 right shift (11) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)  /* Pass 2 right shift (18) */
+
+
+#define DO_IDCT(in, PASS)  \
+{  /* in##0..in##7 -> out0..out7; PASS (1 or 2) picks the descale constants */  \
+  /* Even part  \
+   *  \
+   * (Original)  \
+   * z1 = (z2 + z3) * 0.541196100;  \
+   * tmp2 = z1 + z3 * -1.847759065;  \
+   * tmp3 = z1 + z2 * 0.765366865;  \
+   *  \
+   * (This implementation)  \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
+   */  \
+  \
+  in##26l = vec_mergeh(in##2, in##6);  /* interleave in##2 with in##6 */  \
+  in##26h = vec_mergel(in##2, in##6);  \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
+  \
+  tmp0 = vec_add(in##0, in##4);  \
+  tmp1 = vec_sub(in##0, in##4);  \
+  \
+  tmp0l = vec_unpackh(tmp0);  /* widen to 32-bit elements */  \
+  tmp0h = vec_unpackl(tmp0);  \
+  tmp0l = vec_sl(tmp0l, const_bits);  /* scale by 2^CONST_BITS */  \
+  tmp0h = vec_sl(tmp0h, const_bits);  \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  /* bias for the final shift */  \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l);  \
+  tmp10h = vec_add(tmp0h, tmp3h);  \
+  tmp13l = vec_sub(tmp0l, tmp3l);  \
+  tmp13h = vec_sub(tmp0h, tmp3h);  \
+  \
+  tmp1l = vec_unpackh(tmp1);  \
+  tmp1h = vec_unpackl(tmp1);  \
+  tmp1l = vec_sl(tmp1l, const_bits);  \
+  tmp1h = vec_sl(tmp1h, const_bits);  \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l);  \
+  tmp11h = vec_add(tmp1h, tmp2h);  \
+  tmp12l = vec_sub(tmp1l, tmp2l);  \
+  tmp12h = vec_sub(tmp1h, tmp2h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(in##3, in##7);  \
+  z4 = vec_add(in##1, in##5);  \
+  \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
+  \
+  /* (Original)  \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
+   * tmp0 += z3;  tmp1 += z4;  \
+   * tmp2 += z3;  tmp3 += z4;  \
+   */  \
+  \
+  in##71l = vec_mergeh(in##7, in##1);  \
+  in##71h = vec_mergel(in##7, in##1);  \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  /* z3 is the addend */  \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  /* z4 is the addend */  \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
+  \
+  in##53l = vec_mergeh(in##5, in##3);  \
+  in##53h = vec_mergel(in##5, in##3);  \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
+  \
+  /* Final output stage: add/sub, shift right by DESCALE_P##PASS, pack */  \
+  \
+  out0l = vec_add(tmp10l, tmp3l);  \
+  out0h = vec_add(tmp10h, tmp3h);  \
+  out7l = vec_sub(tmp10l, tmp3l);  \
+  out7h = vec_sub(tmp10h, tmp3h);  \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS);  \
+  out0h = vec_sra(out0h, descale_p##PASS);  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  \
+  out0 = vec_pack(out0l, out0h);  \
+  out7 = vec_pack(out7l, out7h);  \
+  \
+  out1l = vec_add(tmp11l, tmp2l);  \
+  out1h = vec_add(tmp11h, tmp2h);  \
+  out6l = vec_sub(tmp11l, tmp2l);  \
+  out6h = vec_sub(tmp11h, tmp2h);  \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  \
+  out1 = vec_pack(out1l, out1h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  out2l = vec_add(tmp12l, tmp1l);  \
+  out2h = vec_add(tmp12h, tmp1h);  \
+  out5l = vec_sub(tmp12l, tmp1l);  \
+  out5h = vec_sub(tmp12h, tmp1h);  \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out5 = vec_pack(out5l, out5h);  \
+  \
+  out3l = vec_add(tmp13l, tmp0l);  \
+  out3h = vec_add(tmp13h, tmp0h);  \
+  out4l = vec_sub(tmp13l, tmp0l);  \
+  out4h = vec_sub(tmp13h, tmp0h);  \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  out4l = vec_sra(out4l, descale_p##PASS);  \
+  out4h = vec_sra(out4h, descale_p##PASS);  \
+  \
+  out3 = vec_pack(out3l, out3h);  \
+  out4 = vec_pack(out4l, out4h);  \
+}
+
+
+void
+jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;  /* 16-bit dequantization multipliers */
+  int *outptr;
+  /* Dequantize one 8x8 block, run the slow integer IDCT, store 8 output rows. */
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+
+  /* Constants: multiplier pairs for vec_msums, rounding biases, shift counts */
+  __vector short pw_zero = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_zero = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: load the eight coefficient vectors and process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);  /* OR col1..col7 together: zero iff all are zero */
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);  /* dequantize: col0 * quant0 */
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero: shift col0 left by PASS1_BITS, splat one value per row */
+
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows (DO_IDCT descales by DESCALE_P2 internally) */
+
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);  /* saturate to 8 bits; both halves identical */
+  outb = vec_add(outb, pb_centerjsamp);  /* add CENTERJSAMPLE to every byte */
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);  /* two 32-bit element stores ... */
+  vec_ste((__vector int)outb, 4, outptr);  /* ... write the row's 8 bytes */
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jidctint-mmx.asm b/simd/jidctint-mmx.asm
index 7e25b82..fda3b63 100644
--- a/simd/jidctint-mmx.asm
+++ b/simd/jidctint-mmx.asm
@@ -91,11 +91,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; jpeg_component_info * compptr
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -130,7 +130,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF * wsptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
         mov     ecx, DCTSIZE/4                          ; ctr
         alignx  16,7
 .columnloop:
@@ -510,7 +510,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF * wsptr
+        lea     esi, [workspace]                        ; JCOEF *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
         mov     ecx, DCTSIZE/4                          ; ctr
diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm
index 1cc3086..bfec499 100644
--- a/simd/jidctint-sse2-64.asm
+++ b/simd/jidctint-sse2-64.asm
@@ -92,11 +92,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-; r10 = jpeg_component_info * compptr
+; r10 = void *dct_table
 ; r11 = JCOEFPTR coef_block
 ; r12 = JSAMPARRAY output_buf
 ; r13 = JDIMENSION output_col
diff --git a/simd/jidctint-sse2.asm b/simd/jidctint-sse2.asm
index 4a35f3d..1960bcd 100644
--- a/simd/jidctint-sse2.asm
+++ b/simd/jidctint-sse2.asm
@@ -91,11 +91,11 @@
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; jpeg_component_info * compptr
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
diff --git a/simd/jidctred-mmx.asm b/simd/jidctred-mmx.asm
index 1c93901..21e17fc 100644
--- a/simd/jidctred-mmx.asm
+++ b/simd/jidctred-mmx.asm
@@ -99,11 +99,11 @@
 ; producing a reduced-size 4x4 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
 ;                     JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -138,7 +138,7 @@
 ;       mov     eax, [original_ebp]
         mov     edx, POINTER [dct_table(eax)]           ; quantptr
         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF * wsptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
         mov     ecx, DCTSIZE/4                          ; ctr
         alignx  16,7
 .columnloop:
@@ -332,7 +332,7 @@
         ; ---- Pass 2: process rows from work array, store into output array.
 
         mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF * wsptr
+        lea     esi, [workspace]                        ; JCOEF *wsptr
         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
         mov     eax, JDIMENSION [output_col(eax)]
 
@@ -493,11 +493,11 @@
 ; producing a reduced-size 2x2 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
 ;                     JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm
index 02b155a..d1b1874 100644
--- a/simd/jidctred-sse2-64.asm
+++ b/simd/jidctred-sse2-64.asm
@@ -100,11 +100,11 @@
 ; producing a reduced-size 4x4 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-; r10 = void * dct_table
+; r10 = void *dct_table
 ; r11 = JCOEFPTR coef_block
 ; r12 = JSAMPARRAY output_buf
 ; r13 = JDIMENSION output_col
@@ -403,11 +403,11 @@
 ; producing a reduced-size 2x2 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-; r10 = void * dct_table
+; r10 = void *dct_table
 ; r11 = JCOEFPTR coef_block
 ; r12 = JSAMPARRAY output_buf
 ; r13 = JDIMENSION output_col
diff --git a/simd/jidctred-sse2.asm b/simd/jidctred-sse2.asm
index 06dade8..e48c0c5 100644
--- a/simd/jidctred-sse2.asm
+++ b/simd/jidctred-sse2.asm
@@ -99,11 +99,11 @@
 ; producing a reduced-size 4x4 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
@@ -414,11 +414,11 @@
 ; producing a reduced-size 2x2 output block.
 ;
 ; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 
-%define dct_table(b)    (b)+8           ; void * dct_table
+%define dct_table(b)    (b)+8           ; void *dct_table
 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
 %define output_col(b)   (b)+20          ; JDIMENSION output_col
diff --git a/simd/jpeg_nbits_table.inc b/simd/jpeg_nbits_table.inc
new file mode 100644
index 0000000..cbc6990
--- /dev/null
+++ b/simd/jpeg_nbits_table.inc
@@ -0,0 +1,4097 @@
+jpeg_nbits_table db  \
+   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  \
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/simd/jquant-3dn.asm b/simd/jquant-3dn.asm
index 76e19f7..6b7c11c 100644
--- a/simd/jquant-3dn.asm
+++ b/simd/jquant-3dn.asm
@@ -27,12 +27,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                             FAST_FLOAT * workspace);
+;                             FAST_FLOAT *workspace);
 ;
 
 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
 %define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_float_3dnow)
@@ -129,13 +129,13 @@
 ; Quantize/descale the coefficients, and store into coef_block
 ;
 ; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                             FAST_FLOAT * workspace);
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                             FAST_FLOAT *workspace);
 ;
 
 %define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT * divisors
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_quantize_float_3dnow)
diff --git a/simd/jquant-mmx.asm b/simd/jquant-mmx.asm
index 822c7ee..dbfecee 100644
--- a/simd/jquant-mmx.asm
+++ b/simd/jquant-mmx.asm
@@ -27,12 +27,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                     DCTELEM * workspace);
+;                     DCTELEM *workspace);
 ;
 
 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
 %define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM * workspace
+%define workspace       ebp+16          ; DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_mmx)
@@ -126,8 +126,8 @@
 ;   (http://www.agner.org/assem/).
 ;
 ; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
-;                     DCTELEM * workspace);
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
 ;
 
 %define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
@@ -136,8 +136,8 @@
 %define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
 
 %define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM * divisors
-%define workspace       ebp+16          ; DCTELEM * workspace
+%define divisors        ebp+12          ; DCTELEM *divisors
+%define workspace       ebp+16          ; DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_quantize_mmx)
diff --git a/simd/jquant-sse.asm b/simd/jquant-sse.asm
index 3f7fa5d..796723a 100644
--- a/simd/jquant-sse.asm
+++ b/simd/jquant-sse.asm
@@ -27,12 +27,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                           FAST_FLOAT * workspace);
+;                           FAST_FLOAT *workspace);
 ;
 
 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
 %define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_float_sse)
@@ -129,13 +129,13 @@
 ; Quantize/descale the coefficients, and store into coef_block
 ;
 ; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                           FAST_FLOAT * workspace);
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
 ;
 
 %define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT * divisors
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_quantize_float_sse)
diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm
index cf7f0d8..8af256c 100644
--- a/simd/jquantf-sse2-64.asm
+++ b/simd/jquantf-sse2-64.asm
@@ -28,12 +28,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
+;                            FAST_FLOAT *workspace);
 ;
 
 ; r10 = JSAMPARRAY sample_data
 ; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT * workspace
+; r12 = FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_float_sse2)
@@ -101,13 +101,13 @@
 ; Quantize/descale the coefficients, and store into coef_block
 ;
 ; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
 ;
 
 ; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT * divisors
-; r12 = FAST_FLOAT * workspace
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_quantize_float_sse2)
diff --git a/simd/jquantf-sse2.asm b/simd/jquantf-sse2.asm
index b1d3efc..a8d4cd3 100644
--- a/simd/jquantf-sse2.asm
+++ b/simd/jquantf-sse2.asm
@@ -27,12 +27,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
+;                            FAST_FLOAT *workspace);
 ;
 
 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
 %define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_float_sse2)
@@ -106,13 +106,13 @@
 ; Quantize/descale the coefficients, and store into coef_block
 ;
 ; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
 ;
 
 %define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT * divisors
-%define workspace       ebp+16          ; FAST_FLOAT * workspace
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
 
         align   16
         global  EXTN(jsimd_quantize_float_sse2)
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
new file mode 100644
index 0000000..b3adab9
--- /dev/null
+++ b/simd/jquanti-altivec.c
@@ -0,0 +1,252 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* LOAD_ROW(row) points elemptr at sample_data[row] + start_col and loads one
+ * vector from there into in##row.  NOTE: The address will either be aligned
+ * or offset by 8 bytes, so we can always get the data we want by using a
+ * single vector load (although we may have to permute the result.) */
+#if __BIG_ENDIAN__
+/* vec_ld(), plus a vec_perm() by vec_lvsl() when elemptr & 15 is nonzero */
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_ld(0, elemptr);  \
+  if ((size_t)elemptr & 15)  \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+}
+
+#else
+/* A single vec_vsx_ld(), with no alignment test or permute */
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_vsx_ld(0, elemptr);  \
+}
+
+#endif
+
+/* Convert 8 rows of samples to CENTERJSAMPLE-relative shorts in workspace */
+void
+jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM *workspace)
+{
+  JSAMPROW elemptr;
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };
+  /* Load one vector from each of rows 0-7, starting at column start_col */
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+  /* Widen to shorts; VEC_UNPACKHU is defined elsewhere (presumably uses pb_zero) */
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
+  /* Subtract CENTERJSAMPLE from every element */
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+  /* Store the 8 result vectors at vec_st() offsets 0, 16, ..., 112 in workspace */
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT 16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+   We basically need an unsigned equivalent of vec_madds().  MULTIPLY() writes
+   tmpe and tmpo and reads shift_pack_index, so all three must be in scope. */
+#define MULTIPLY(vs0, vs1, out) {  \
+  tmpe = vec_mule((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  tmpo = vec_mulo((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
+                                 (__vector unsigned short)tmpo,  \
+                                 shift_pack_index);  \
+}
+/* Quantize the 8 vectors in workspace using divisors; result to coef_block */
+void
+jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants (shift_pack_index is the vec_perm() selector used by MULTIPLY) */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+  __vector unsigned char shift_pack_index =
+    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
+  /* Load the 8 input vectors from workspace */
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value: s = row >> (WORD_BIT - 1); row = (row ^ s) - s */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+  /* Add corr0-7, loaded at vec_ld() offset DCTSIZE2 * 2 from divisors */
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+  /* Multiply by recip0-7, loaded from the start of divisors */
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+  /* Multiply by scale0-7, loaded at vec_ld() offset DCTSIZE2 * 4 from divisors */
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+  /* Reapply the sign masks saved in row0s-7s: row = (row ^ s) - s */
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+  /* Store the 8 result vectors in coef_block */
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm
index b61f4db..9b3f4ee 100644
--- a/simd/jquanti-sse2-64.asm
+++ b/simd/jquanti-sse2-64.asm
@@ -28,12 +28,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
+;                      DCTELEM *workspace);
 ;
 
 ; r10 = JSAMPARRAY sample_data
 ; r11 = JDIMENSION start_col
-; r12 = DCTELEM * workspace
+; r12 = DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_sse2)
@@ -99,8 +99,8 @@
 ;   (http://www.agner.org/assem/).
 ;
 ; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
 ;
 
 %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
@@ -108,8 +108,8 @@
 %define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
 
 ; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM * divisors
-; r12 = DCTELEM * workspace
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_quantize_sse2)
diff --git a/simd/jquanti-sse2.asm b/simd/jquanti-sse2.asm
index 79ca3db..4299c33 100644
--- a/simd/jquanti-sse2.asm
+++ b/simd/jquanti-sse2.asm
@@ -27,12 +27,12 @@
 ;
 ; GLOBAL(void)
 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
+;                      DCTELEM *workspace);
 ;
 
 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
 %define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM * workspace
+%define workspace       ebp+16          ; DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_convsamp_sse2)
@@ -104,8 +104,8 @@
 ;   (http://www.agner.org/assem/).
 ;
 ; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
 ;
 
 %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
@@ -113,8 +113,8 @@
 %define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
 
 %define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM * divisors
-%define workspace       ebp+16          ; DCTELEM * workspace
+%define divisors        ebp+12          ; DCTELEM *divisors
+%define workspace       ebp+16          ; DCTELEM *workspace
 
         align   16
         global  EXTN(jsimd_quantize_sse2)
diff --git a/simd/jsimd.h b/simd/jsimd.h
index c5abd45..a39fafa 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,9 +2,10 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2011 D. R. Commander
+ * Copyright (C) 2011, 2014-2016 D. R. Commander
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
  * Copyright (C) 2014 Linaro Limited
+ * Copyright (C) 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,6 +22,7 @@
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
 #define JSIMD_MIPS_DSPR2 0x20
+#define JSIMD_ALTIVEC    0x40
 
 /* SIMD Ext: retrieve SIMD/CPU information */
 EXTERN(unsigned int) jpeg_simd_cpu_support (void);
@@ -93,6 +95,13 @@
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
@@ -115,6 +124,28 @@
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 /* RGB & extended RGB --> Grayscale Colorspace Conversion */
 EXTERN(void) jsimd_rgb_gray_convert_mmx
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -183,6 +214,28 @@
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 /* YCC --> RGB & extended RGB Colorspace Conversion */
 EXTERN(void) jsimd_ycc_rgb_convert_mmx
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -254,6 +307,13 @@
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
 
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
 EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
@@ -276,6 +336,28 @@
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
 
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
 /* NULL Colorspace Conversion */
 EXTERN(void) jsimd_c_null_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -292,11 +374,21 @@
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v1_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
         (JDIMENSION image_width, int max_v_samp_factor,
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v1_downsample_altivec
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 /* h2v2 Downsampling */
 EXTERN(void) jsimd_h2v2_downsample_mmx
         (JDIMENSION image_width, int max_v_samp_factor,
@@ -308,11 +400,21 @@
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v2_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
         (JDIMENSION image_width, int max_v_samp_factor,
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v2_downsample_altivec
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 /* h2v2 Smooth Downsampling */
 EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
         (JSAMPARRAY input_data, JSAMPARRAY output_data,
@@ -324,57 +426,70 @@
 /* Upsampling */
 EXTERN(void) jsimd_h2v1_upsample_mmx
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_mmx
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_upsample_sse2
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_sse2
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
         (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_int_upsample_mips_dspr2
         (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
-         JSAMPARRAY * output_data_ptr, JDIMENSION output_width,
+         JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
          int max_v_samp_factor);
 
+EXTERN(void) jsimd_h2v1_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
 
 /* Fancy Upsampling */
 EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 extern const int jconst_fancy_upsample_sse2[];
 EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
         (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 /* Merged Upsampling */
 EXTERN(void) jsimd_h2v1_merged_upsample_mmx
@@ -510,169 +625,247 @@
         (JDIMENSION output_width, JSAMPIMAGE input_buf,
          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
 /* Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_neon
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_altivec
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 /* Floating Point Sample Conversion */
 EXTERN(void) jsimd_convsamp_float_3dnow
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace);
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 /* Slow Integer Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM * data);
+EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
 
 extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
 
 /* Fast Integer Forward DCT */
-EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
+EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
 
 extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM * data);
+EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM * data);
+EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
 
 /* Floating Point Forward DCT */
-EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT * data);
+EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
 
 extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT * data);
+EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
 
 /* Quantization */
 EXTERN(void) jsimd_quantize_mmx
-        (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_sse2
-        (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_neon
-        (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_mips_dspr2
-        (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace);
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_altivec
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 /* Floating Point Quantization */
 EXTERN(void) jsimd_quantize_float_3dnow
-        (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse
-        (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse2
-        (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_mips_dspr2
-        (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace);
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 /* Scaled Inverse DCT */
 EXTERN(void) jsimd_idct_2x2_mmx
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_mmx
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 extern const int jconst_idct_red_sse2[];
 EXTERN(void) jsimd_idct_2x2_sse2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_sse2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_2x2_neon
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_neon
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_2x2_mips_dspr2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_mips_dspr2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col, int * workspace);
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col, int *workspace);
 EXTERN(void) jsimd_idct_6x6_mips_dspr2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
-        (JCOEFPTR coef_block, void * dct_table, int * workspace);
+        (JCOEFPTR coef_block, void *dct_table, int *workspace);
 EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
-        (int * workspace, int * output);
+        (int *workspace, int *output);
 
 /* Slow Integer Inverse DCT */
 EXTERN(void) jsimd_idct_islow_mmx
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 extern const int jconst_idct_islow_sse2[];
 EXTERN(void) jsimd_idct_islow_sse2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_neon
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_mips_dspr2
-        (void * dct_table, JCOEFPTR coef_block, int * output_buf,
-         JSAMPLE * output_col);
+        (void *dct_table, JCOEFPTR coef_block, int *output_buf,
+         JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_altivec
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 /* Fast Integer Inverse DCT */
 EXTERN(void) jsimd_idct_ifast_mmx
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 extern const int jconst_idct_ifast_sse2[];
 EXTERN(void) jsimd_idct_ifast_sse2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_ifast_neon
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
-        (JCOEF * inptr, IFAST_MULT_TYPE * quantptr, DCTELEM * wsptr,
-         const int * idct_coefs);
+        (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+         const int *idct_coefs);
 EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
-        (DCTELEM * wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
-         const int * idct_coefs);
+        (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+         const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_altivec
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 /* Floating Point Inverse DCT */
 EXTERN(void) jsimd_idct_float_3dnow
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse[];
 EXTERN(void) jsimd_idct_float_sse
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse2[];
 EXTERN(void) jsimd_idct_float_sse2
-        (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
          JDIMENSION output_col);
+
+/* Huffman coding */
+extern const int jconst_huff_encode_one_block[];
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_altivec.h b/simd/jsimd_altivec.h
new file mode 100644
index 0000000..2660219
--- /dev/null
+++ b/simd/jsimd_altivec.h
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a) a, a, a, a  /* a, repeated 4 times */
+#define __4X2(a, b) a, b, a, b, a, b, a, b  /* the pair a, b, 4 times */
+#define __8X(a) __4X(a), __4X(a)  /* a, repeated 8 times */
+#define __16X(a) __8X(a), __8X(a)  /* a, repeated 16 times */
+
+#define TRANSPOSE(row, col)  \
+{  \
+  __vector short row04l, row04h, row15l, row15h,  \
+                 row26l, row26h, row37l, row37h;  \
+  __vector short col01e, col01o, col23e, col23o,  \
+                 col45e, col45o, col67e, col67o;  \
+  /* 8x8 transpose of 16-bit elements: row##0..row##7 -> col##0..col##7 */  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+}
+
+#ifndef min
+#define min(a,b) ((a) < (b) ? (a) : (b))  /* NB: evaluates a or b twice */
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+#if __BIG_ENDIAN__
+/* Big endian: vec_ld()/vec_st(); pb_zero is the first vec_merge operand. */
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+/* Otherwise: vec_vsx_ld()/vec_vsx_st(); pb_zero is the second operand. */
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index 4cbcf2d..ea621da 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,7 +2,8 @@
  * jsimd_arm.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -26,6 +27,7 @@
 #include <ctype.h>
 
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
 
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
@@ -127,6 +129,9 @@
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
 }
 
 GLOBAL(int)
@@ -227,8 +232,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -273,8 +277,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -282,9 +285,8 @@
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -304,13 +306,13 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
@@ -333,17 +335,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -374,22 +376,21 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
@@ -455,15 +456,14 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_convsamp_neon(sample_data, start_col, workspace);
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -501,19 +501,18 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_fdct_ifast_neon(data);
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -545,16 +544,15 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_quantize_neon(coef_block, divisors, workspace);
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -605,23 +603,21 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -681,28 +677,51 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+  /* Report no SIMD support unless DCTSIZE is 8 and JCOEF is 2 bytes wide. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  /* '&' binds tighter than '&&': NEON must be present and simd_huffman set. */
+  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{ /* Forwards all arguments to the NEON routine; simd_support is not tested. */
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 65724cb..62dbc45 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -2,7 +2,8 @@
  * jsimd_arm64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -25,7 +26,84 @@
 #include <string.h>
 #include <ctype.h>
 
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if the line in 'buffer' begins with 'field' and, after it, contains
+   'value' as a whitespace-delimited word; return 0 otherwise. */
+LOCAL(int)
+check_cpuinfo (char *buffer, const char *field, char *value)
+{
+  char *p, *start;
+  size_t field_len = strlen(field), value_len = strlen(value);
+
+  if (value_len == 0 || strncmp(buffer, field, field_len) != 0)
+    return 0;
+  buffer += field_len;
+  /* Casts: <ctype.h> functions are undefined for negative char values. */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  start = buffer;
+
+  /* Accept the first occurrence of 'value' bounded by whitespace or by the
+     ends of the text.  The left edge is tested against the fixed 'start':
+     testing against a moving cursor would let a mid-word match through. */
+  while ((p = strstr(buffer, value))) {
+    if ((p == start || isspace((unsigned char)p[-1])) &&
+        (p[value_len] == 0 || isspace((unsigned char)p[value_len])))
+      return 1;
+    buffer = p + 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo (int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  if (!buffer)
+    return 0;  /* allocation failed */
+  /* Scan /proc/cpuinfo line by line, clearing feature flags for known CPUs. */
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+          check_cpuinfo(buffer, "CPU part", "0xd07"))
+        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
+           percent speedup by disabling the use of that instruction.  The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+        /* The SIMD version of Huffman encoding is slower than the C version on
+           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;  /* also returned when /proc/cpuinfo cannot be opened */
+}
+
+#endif
 
 /*
  * Check what SIMD accelerations are supported.
@@ -33,16 +111,19 @@
  * FIXME: This code is racy under a multi-threaded environment.
  */
 
-/* 
+/*
  * ARMv8 architectures support NEON extensions by default.
  * It is no longer optional as it was with ARMv7.
- */ 
+ */
 
 
 LOCAL(void)
 init_simd (void)
 {
   char *env = NULL;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
@@ -50,6 +131,13 @@
   simd_support = 0;
 
   simd_support |= JSIMD_ARM_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
 
   /* Force different settings through environment variables */
   env = getenv("JSIMD_FORCENEON");
@@ -58,6 +146,19 @@
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
+  env = getenv("JSIMD_FASTLD3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTLD3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTLD3;
+  env = getenv("JSIMD_FASTST3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTST3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTST3;
 }
 
 GLOBAL(int)
@@ -65,6 +166,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -117,6 +229,46 @@
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                        JDIMENSION output_row, int num_rows)
 {
+  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct=jsimd_extrgbx_ycc_convert_neon;
+      break;
+    case JCS_EXT_BGR:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extbgr_ycc_convert_neon;
+      else
+        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct=jsimd_extbgrx_ycc_convert_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct=jsimd_extxbgr_ycc_convert_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct=jsimd_extxrgb_ycc_convert_neon;
+      break;
+    default:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
+      break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -135,14 +287,20 @@
 
   switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
       neonfct=jsimd_ycc_extrgbx_convert_neon;
       break;
     case JCS_EXT_BGR:
-      neonfct=jsimd_ycc_extbgr_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extbgr_convert_neon;
+      else
+        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
@@ -157,12 +315,14 @@
       neonfct=jsimd_ycc_extxrgb_convert_neon;
       break;
     default:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -170,9 +330,8 @@
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -180,6 +339,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -188,19 +358,36 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(int)
@@ -221,17 +408,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -253,17 +440,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -304,6 +491,19 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -317,13 +517,14 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -332,6 +533,15 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -340,6 +550,15 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -352,17 +571,19 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
+  jsimd_fdct_islow_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -371,6 +592,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -383,14 +615,15 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -441,23 +674,21 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -517,28 +748,55 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if ((DCTSIZE != 8) || (sizeof(JCOEF) != 2))
+    return 0;
+
+  /* simd_huffman is cleared e.g. by JSIMD_NOHUFFENC=1 in init_simd() */
+  if (!simd_huffman)
+    return 0;
+
+  return (simd_support & JSIMD_ARM_NEON) ? 1 : 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  /* Without JSIMD_FASTTBL, dispatch to the _slowtbl NEON encoder instead */
+  if (!(simd_features & JSIMD_FASTTBL))
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 2186f24..d236314 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -6,7 +6,9 @@
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014, D. R. Commander.  All rights reserved.
+ * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -26,11 +28,10 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
-.arch armv8-a+fp+simd
 
 
 #define RESPECT_STRICT_ALIGNMENT 1
@@ -54,42 +55,71 @@
 .endm
 
 /* Transpose elements of single 128 bit registers */
-.macro transpose_single x0,x1,xi,xilen,literal
-    ins  \xi\xilen[0],  \x0\xilen[0]
-    ins  \x1\xilen[0],  \x0\xilen[1]
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose_single x0, x1, xi, xilen, literal
+    ins             \xi\xilen[0], \x0\xilen[0]
+    ins             \x1\xilen[0], \x0\xilen[1]
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose elements of 2 differnet registers */
-.macro transpose x0,x1,xi,xilen,literal
-    mov  \xi\xilen,     \x0\xilen
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose x0, x1, xi, xilen, literal
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x2\x2len
-    trn2 \x2\x2len, \xi\x0len, \x2\x2len
-    mov  \xi\xilen, \x1\xilen
-    trn1 \x1\x1len, \x1\x1len, \x3\x3len
-    trn2 \x3\x3len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\x0len, \x0\x0len, \x2\x2len
+    trn2            \x2\x2len, \xi\x0len, \x2\x2len
+    mov             \xi\xilen, \x1\xilen
+    trn1            \x1\x1len, \x1\x1len, \x3\x3len
+    trn2            \x3\x3len, \xi\x1len, \x3\x3len
 .endm
 
-.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x1\x1len
-    trn2 \x1\x2len, \xi\x0len, \x1\x2len
-    mov  \xi\xilen, \x2\xilen
-    trn1 \x2\x2len, \x2\x2len, \x3\x3len
-    trn2 \x3\x2len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\x0len, \x0\x0len, \x1\x1len
+    trn2            \x1\x2len, \xi\x0len, \x1\x2len
+    mov             \xi\xilen, \x2\xilen
+    trn1            \x2\x2len, \x2\x2len, \x3\x3len
+    trn2            \x3\x2len, \xi\x1len, \x3\x3len
 .endm
 
-.macro transpose_4x4 x0, x1, x2, x3,x5
-    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
-    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
+.macro transpose_4x4 x0, x1, x2, x3, x5
+    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
+    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
+.endm
+
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
+    /* above: trn1/trn2 on 16-bit (.8h) elements; next: 32-bit (.4s) */
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
+    /* last pass: trn1/trn2 on 64-bit (.2d) elements, results in l0-l7 */
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
 .endm
 
 
@@ -101,630 +131,606 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
+#define CONST_BITS 13
+#define PASS1_BITS 2
 
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
-{                                                                             \
-    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
-    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
-                                                                              \
-    /* 1-D iDCT input data */                                                 \
-    row0 = xrow0;                                                             \
-    row1 = xrow1;                                                             \
-    row2 = xrow2;                                                             \
-    row3 = xrow3;                                                             \
-    row4 = xrow4;                                                             \
-    row5 = xrow5;                                                             \
-    row6 = xrow6;                                                             \
-    row7 = xrow7;                                                             \
-                                                                              \
-    q5 = row7 + row3;                                                         \
-    q4 = row5 + row1;                                                         \
-    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
-         MULTIPLY(q4, FIX_1_175875602);                                       \
-    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
-         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
-    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
-         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
-    q4 = q6;                                                                  \
-    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
-    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
-          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
-    /* now we can use q1 (reloadable constants have been used up) */          \
-    q1 = q3 + q2;                                                             \
-    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
-          MULTIPLY(row1, -FIX_0_899976223);                                   \
-    q5 = q7;                                                                  \
-    q1 = q1 + q6;                                                             \
-    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
-          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
-                                                                              \
-    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
-    tmp11_plus_tmp2 = q1;                                                     \
-    row1 = 0;                                                                 \
-                                                                              \
-    q1 = q1 - q6;                                                             \
-    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
-          MULTIPLY(row3, -FIX_2_562915447);                                   \
-    q1 = q1 - q6;                                                             \
-    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
-         MULTIPLY(row6, FIX_0_541196100);                                     \
-    q3 = q3 - q2;                                                             \
-                                                                              \
-    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
-    tmp11_minus_tmp2 = q1;                                                    \
-                                                                              \
-    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
-    q2 = q1 + q6;                                                             \
-    q1 = q1 - q6;                                                             \
-                                                                              \
-    /* pick up the results */                                                 \
-    tmp0  = q4;                                                               \
-    tmp1  = q5;                                                               \
-    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
-    tmp3  = q7;                                                               \
-    tmp10 = q2;                                                               \
-    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
-    tmp12 = q3;                                                               \
-    tmp13 = q1;                                                               \
-}
-
-#define XFIX_0_899976223                    v0.h[0]
-#define XFIX_0_541196100                    v0.h[1]
-#define XFIX_2_562915447                    v0.h[2]
-#define XFIX_0_298631336_MINUS_0_899976223  v0.h[3]
-#define XFIX_1_501321110_MINUS_0_899976223  v1.h[0]
-#define XFIX_2_053119869_MINUS_2_562915447  v1.h[1]
-#define XFIX_0_541196100_PLUS_0_765366865   v1.h[2]
-#define XFIX_1_175875602                    v1.h[3]
-#define XFIX_1_175875602_MINUS_0_390180644  v2.h[0]
-#define XFIX_0_541196100_MINUS_1_847759065  v2.h[1]
-#define XFIX_3_072711026_MINUS_2_562915447  v2.h[2]
-#define XFIX_1_175875602_MINUS_1_961570560  v2.h[3]
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
 
 .balign 16
 Ljsimd_idct_islow_neon_consts:
-    .short FIX_0_899976223                    /* d0[0] */
-    .short FIX_0_541196100                    /* d0[1] */
-    .short FIX_2_562915447                    /* d0[2] */
-    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-    .short FIX_1_175875602                    /* d1[3] */
-    /* reloadable constants */
-    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+  .short  F_0_298   /* cf. XFIX_P_0_298 */
+  .short -F_0_390   /* cf. XFIX_N_0_390 */
+  .short  F_0_541   /* cf. XFIX_P_0_541 */
+  .short  F_0_765   /* cf. XFIX_P_0_765 */
+  .short -F_0_899   /* cf. XFIX_N_0_899 */
+  .short  F_1_175   /* cf. XFIX_P_1_175 */
+  .short  F_1_501   /* cf. XFIX_P_1_501 */
+  .short -F_1_847   /* cf. XFIX_N_1_847 */
+  .short -F_1_961   /* cf. XFIX_N_1_961 */
+  .short  F_2_053   /* cf. XFIX_P_2_053 */
+  .short -F_2_562   /* cf. XFIX_N_2_562 */
+  .short  F_3_072   /* cf. XFIX_P_3_072 */
+  .short  0         /* padding: brings the table to 16 halfwords */
+  .short  0
+  .short  0
+  .short  0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
 
 asm_function jsimd_idct_islow_neon
-
     DCT_TABLE       .req x0
     COEF_BLOCK      .req x1
     OUTPUT_BUF      .req x2
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
 
-    ROW0L           .req v16
-    ROW0R           .req v17
-    ROW1L           .req v18
-    ROW1R           .req v19
-    ROW2L           .req v20
-    ROW2R           .req v21
-    ROW3L           .req v22
-    ROW3R           .req v23
-    ROW4L           .req v24
-    ROW4R           .req v25
-    ROW5L           .req v26
-    ROW5R           .req v27
-    ROW6L           .req v28
-    ROW6R           .req v29
-    ROW7L           .req v30
-    ROW7R           .req v31
-    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
-    sub             sp, sp, 272
-    str             x15, [sp], 16
+    sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
-    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
-    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
-    mul             v16.4h, v16.4h, v0.4h
-    mul             v17.4h, v17.4h, v1.4h
-    ins             v16.d[1], v17.d[0]  /* 128 bit q8 */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
-    mul             v18.4h, v18.4h, v2.4h
-    mul             v19.4h, v19.4h, v3.4h
-    ins             v18.d[1], v19.d[0]  /* 128 bit q9 */
-    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
-    mul             v20.4h, v20.4h, v4.4h
-    mul             v21.4h, v21.4h, v5.4h
-    ins             v20.d[1], v21.d[0]  /* 128 bit q10 */
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
-    mul             v22.4h, v22.4h, v6.4h
-    mul             v23.4h, v23.4h, v7.4h
-    ins             v22.d[1], v23.d[0]  /* 128 bit q11 */
-    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
-    mul             v24.4h, v24.4h, v0.4h
-    mul             v25.4h, v25.4h, v1.4h
-    ins             v24.d[1], v25.d[0]  /* 128 bit q12 */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
-    mul             v28.4h, v28.4h, v4.4h
-    mul             v29.4h, v29.4h, v5.4h
-    ins             v28.d[1], v29.d[0]  /* 128 bit q14 */
-    mul             v26.4h, v26.4h, v2.4h
-    mul             v27.4h, v27.4h, v3.4h
-    ins             v26.d[1], v27.d[0]  /* 128 bit q13 */
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
-    add             x15, x15, #16
-    mul             v30.4h, v30.4h, v6.4h
-    mul             v31.4h, v31.4h, v7.4h
-    ins             v30.d[1], v31.d[0]  /* 128 bit q15 */
-    /* Go to the bottom of the stack */
-    sub             sp, sp, 352
-    stp             x4, x5, [sp], 16
-    st1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32  /* save NEON registers */
-    st1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
-    /* 1-D IDCT, pass 1, left 4x8 half */
-    add             v4.4h,    ROW7L.4h, ROW3L.4h
-    add             v5.4h,    ROW5L.4h, ROW1L.4h
-    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
-    smull           v14.4s,   v4.4h,    XFIX_1_175875602
-    /* Check for the zero coefficients in the right 4x8 half */
-    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
-    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
-      orr           x0,       x4,       x5
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    shl             v6.4s,    v6.4s,    #13
-      orr           x0,       x0,       x4
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-      orr           x0,       x0 ,      x5
-    add             v2.4s,    v6.4s,    v4.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-      orr           x0,       x0,       x4
-    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
-      orr           x0,       x0,       x5
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    rshrn           ROW1L.4h, v2.4s,    #11
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
-      orr           x0,       x0,       x4
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-      orr           x0,       x0,       x5
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-      orr           x0,       x0,       x4
-    rshrn           ROW6L.4h, v2.4s,    #11
-      orr           x0,       x0,       x5
-    add             v2.4s,    v6.4s,    v10.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
-      orr           x0,       x0,       x4
-    rshrn           ROW2L.4h, v2.4s,    #11
-      orr           x0,       x0,       x5
-    rshrn           ROW5L.4h, v6.4s,    #11
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
-      orr           x0,       x0,       x4
-    add             v4.4s,    v10.4s,   v12.4s
-      orr           x0,       x0,       x5
-    cmp             x0, #0 /* orrs instruction removed */
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-      orr           x0,       x4,       x5
-    sub             v6.4s,    v2.4s,    v8.4s
-      /* pop             {x4, x5} */
-      sub           sp, sp, 80
-      ldp           x4, x5, [sp], 16
-    rshrn           ROW7L.4h, v4.4s,    #11
-    rshrn           ROW3L.4h, v10.4s,   #11
-    rshrn           ROW0L.4h, v12.4s,   #11
-    rshrn           ROW4L.4h, v6.4s,    #11
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    ld1             {v0.8h, v1.8h}, [x15]
+    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
 
-      b.eq          3f /* Go to do some special handling for the sparse right 4x8 half */
+    cmeq            v16.8h, v3.8h, #0
+    cmeq            v26.8h, v4.8h, #0
+    cmeq            v27.8h, v5.8h, #0
+    cmeq            v28.8h, v6.8h, #0
+    cmeq            v29.8h, v7.8h, #0
+    cmeq            v30.8h, v8.8h, #0
+    cmeq            v31.8h, v9.8h, #0
 
-    /* 1-D IDCT, pass 1, right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    add             v10.4h,   ROW7R.4h, ROW3R.4h
-    add             v8.4h,    ROW5R.4h, ROW1R.4h
-    /* Transpose ROW6L <-> ROW7L   (v3 available free register) */
-    transpose       ROW6L, ROW7L, v3, .16b, .4h
-    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
-    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
-    transpose       ROW2L, ROW3L, v3, .16b, .4h
-    smull           v14.4s,   v10.4h,   XFIX_1_175875602
-    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
-    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
-    transpose       ROW0L, ROW1L, v3, .16b, .4h
-    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
-    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
-    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
-    transpose       ROW4L, ROW5L, v3, .16b, .4h
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
-    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
-    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
-    transpose       ROW1L, ROW3L, v3, .16b, .2s
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
-    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
-    transpose       ROW4L, ROW6L, v3, .16b, .2s
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
-    transpose       ROW0L, ROW2L, v3, .16b, .2s
-    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
-    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
-    rshrn           ROW1R.4h, v2.4s,    #11
-    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
-    transpose       ROW5L, ROW7L, v3, .16b, .2s
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
-    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
-    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-    rshrn           ROW6R.4h, v2.4s,    #11
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
-    rshrn           ROW2R.4h, v2.4s,    #11
-    rshrn           ROW5R.4h, v6.4s,    #11
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    rshrn           ROW7R.4h, v4.4s,    #11
-    rshrn           ROW3R.4h, v10.4s,   #11
-    rshrn           ROW0R.4h, v12.4s,   #11
-    rshrn           ROW4R.4h, v6.4s,    #11
-    /* Transpose right 4x8 half */
-    transpose       ROW6R, ROW7R, v3, .16b, .4h
-    transpose       ROW2R, ROW3R, v3, .16b, .4h
-    transpose       ROW0R, ROW1R, v3, .16b, .4h
-    transpose       ROW4R, ROW5R, v3, .16b, .4h
-    transpose       ROW1R, ROW3R, v3, .16b, .2s
-    transpose       ROW4R, ROW6R, v3, .16b, .2s
-    transpose       ROW0R, ROW2R, v3, .16b, .2s
-    transpose       ROW5R, ROW7R, v3, .16b, .2s
+    and             v10.16b, v16.16b, v26.16b
+    and             v11.16b, v27.16b, v28.16b
+    and             v12.16b, v29.16b, v30.16b
+    and             v13.16b, v31.16b, v10.16b
+    and             v14.16b, v11.16b, v12.16b
+    mul             v2.8h, v2.8h, v18.8h
+    and             v15.16b, v13.16b, v14.16b
+    shl             v10.8h, v2.8h, #(PASS1_BITS)
+    sqxtn           v16.8b, v15.8h
+    mov             TMP1, v16.d[0]
+    sub             sp, sp, #64
+    mvn             TMP2, TMP1
 
-1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
-    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    shrn            ROW1L.4h, v2.4s,    #16
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW2L.4h, v2.4s,    #16
-    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW3L.4h, v10.4s,   #16
-    shrn            ROW0L.4h, v12.4s,   #16
-    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
-    /* 1-D IDCT, pass 2, right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
-    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
-    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
-    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
-    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
-    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
-    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
-    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
-    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW6R.4h, v2.4s,    #16
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    shrn            ROW5R.4h, v6.4s,    #16
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW7R.4h, v4.4s,    #16
-    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW4R.4h, v6.4s,    #16
+    cbnz            TMP2, 2f
+    /* case all AC coeffs are zeros */
+    dup             v2.2d, v10.d[0]
+    dup             v6.2d, v10.d[1]
+    mov             v3.16b, v2.16b
+    mov             v7.16b, v6.16b
+    mov             v4.16b, v2.16b
+    mov             v8.16b, v6.16b
+    mov             v5.16b, v2.16b
+    mov             v9.16b, v6.16b
+1:
+    /* for this transpose, we should organise data like this:
+     * 00, 01, 02, 03, 40, 41, 42, 43
+     * 10, 11, 12, 13, 50, 51, 52, 53
+     * 20, 21, 22, 23, 60, 61, 62, 63
+     * 30, 31, 32, 33, 70, 71, 72, 73
+     * 04, 05, 06, 07, 44, 45, 46, 47
+     * 14, 15, 16, 17, 54, 55, 56, 57
+     * 24, 25, 26, 27, 64, 65, 66, 67
+     * 34, 35, 36, 37, 74, 75, 76, 77
+     */
+    trn1            v28.8h, v2.8h, v3.8h
+    trn1            v29.8h, v4.8h, v5.8h
+    trn1            v30.8h, v6.8h, v7.8h
+    trn1            v31.8h, v8.8h, v9.8h
+    trn2            v16.8h, v2.8h, v3.8h
+    trn2            v17.8h, v4.8h, v5.8h
+    trn2            v18.8h, v6.8h, v7.8h
+    trn2            v19.8h, v8.8h, v9.8h
+    trn1            v2.4s, v28.4s, v29.4s
+    trn1            v6.4s, v30.4s, v31.4s
+    trn1            v3.4s, v16.4s, v17.4s
+    trn1            v7.4s, v18.4s, v19.4s
+    trn2            v4.4s, v28.4s, v29.4s
+    trn2            v8.4s, v30.4s, v31.4s
+    trn2            v5.4s, v16.4s, v17.4s
+    trn2            v9.4s, v18.4s, v19.4s
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 
-2:  /* Descale to 8-bit and range limit */
-    ins             v16.d[1], v17.d[0]
-    ins             v18.d[1], v19.d[0]
-    ins             v20.d[1], v21.d[0]
-    ins             v22.d[1], v23.d[0]
-    sqrshrn         v16.8b,   v16.8h,   #2
-    sqrshrn2        v16.16b,  v18.8h,   #2
-    sqrshrn         v18.8b,   v20.8h,   #2
-    sqrshrn2        v18.16b,  v22.8h,   #2
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
 
-    /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
-    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
-    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
-    ins             v24.d[1], v25.d[0]
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 
-    sqrshrn         v20.8b,   v24.8h,   #2
-      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-    /* trn1            v16.8h,    v16.8h,  v18.8h */
-    transpose       v16, v18, v3, .16b, .8h
-    ins             v26.d[1], v27.d[0]
-    ins             v28.d[1], v29.d[0]
-    ins             v30.d[1], v31.d[0]
-    sqrshrn2        v20.16b,  v26.8h,   #2
-    sqrshrn         v22.8b,   v28.8h,   #2
-    movi            v0.16b,   #(CENTERJSAMPLE)
-    sqrshrn2        v22.16b,  v30.8h,   #2
-    transpose_single v16, v17, v3, .d, .8b
-    transpose_single v18, v19, v3, .d, .8b
-    add             v16.8b,   v16.8b,   v0.8b
-    add             v17.8b,   v17.8b,   v0.8b
-    add             v18.8b,   v18.8b,   v0.8b
-    add             v19.8b,   v19.8b,   v0.8b
-    transpose       v20, v22, v3, .16b, .8h
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    movi            v0.16b, #(CENTERJSAMPLE)
+    /* Prepare pointers (dual-issue with NEON instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP1, TMP1, OUTPUT_COL
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP2, TMP2, OUTPUT_COL
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP3, TMP3, OUTPUT_COL
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP4, TMP4, OUTPUT_COL
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
+    /* Transpose the final 8-bit samples */
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
     /* Store results to the output buffer */
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v16.8b}, [TMP1]
-    transpose_single v20, v21, v3, .d, .8b
-    st1             {v17.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v18.8b}, [TMP1]
-    add             v20.8b,   v20.8b,   v0.8b
-    add             v21.8b,   v21.8b,   v0.8b
-    st1             {v19.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    add             TMP3,     TMP3,     OUTPUT_COL
-    add             TMP4,     TMP4,     OUTPUT_COL
-    transpose_single v22, v23, v3, .d, .8b
-    st1             {v20.8b}, [TMP1]
-    add             v22.8b,   v22.8b,   v0.8b
-    add             v23.8b,   v23.8b,   v0.8b
-    st1             {v21.8b}, [TMP2]
-    st1             {v22.8b}, [TMP3]
-    st1             {v23.8b}, [TMP4]
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
     blr             x30
 
-3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+.balign 16
+2:
+    mul             v3.8h, v3.8h, v19.8h  /* dequantize rows 1..7 (v3..v9); v19..v25 presumably hold the matching quant rows, loaded above this chunk - confirm */
+    mul             v4.8h, v4.8h, v20.8h
+    mul             v5.8h, v5.8h, v21.8h
+    add             TMP4, xzr, TMP2, LSL #32  /* TMP4 = TMP2 << 32: only the low 32 bits of TMP2 survive (tested at 3:) */
+    mul             v6.8h, v6.8h, v22.8h
+    mul             v7.8h, v7.8h, v23.8h
+    adds            TMP3, xzr, TMP2, LSR #32  /* TMP3 = TMP2 >> 32; Z is set when the upper 32 bits of TMP2 are zero */
+    mul             v8.8h, v8.8h, v24.8h
+    mul             v9.8h, v9.8h, v25.8h
+    b.ne            3f  /* upper 32 bits non-zero: the right half cannot be short-cut */
+    /* Right AC coef is zero: only the low 4 lanes (.4h) go through the full 1-D IDCT */
+    dup             v15.2d, v10.d[1]  /* save v10.d[1] in both halves of v15; v10 is reused as tmp0 below */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 
-    /* Transpose left 4x8 half */
-    transpose       ROW6L, ROW7L, v3, .16b, .4h
-    transpose       ROW2L, ROW3L, v3, .16b, .4h
-    transpose       ROW0L, ROW1L, v3, .16b, .4h
-    transpose       ROW4L, ROW5L, v3, .16b, .4h
-    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
-    transpose       ROW1L, ROW3L, v3, .16b, .2s
-    transpose       ROW4L, ROW6L, v3, .16b, .2s
-    transpose       ROW0L, ROW2L, v3, .16b, .2s
-    transpose       ROW5L, ROW7L, v3, .16b, .2s
-    cmp             x0, #0
-    b.eq            4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
 
-    /* Only row 0 is non-zero for the right 4x8 half  */
-    dup             ROW1R.4h, ROW0R.h[1]
-    dup             ROW2R.4h, ROW0R.h[2]
-    dup             ROW3R.4h, ROW0R.h[3]
-    dup             ROW4R.4h, ROW0R.h[0]
-    dup             ROW5R.4h, ROW0R.h[1]
-    dup             ROW6R.4h, ROW0R.h[2]
-    dup             ROW7R.4h, ROW0R.h[3]
-    dup             ROW0R.4h, ROW0R.h[0]
-    b               1b /* Go to 'normal' second pass */
+    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
 
-4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    sshll           v6.4s,    ROW0L.4h, #13
-    mov             v8.16b,   v12.16b
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    add             v2.4s,    v2.4s,    v12.4s
-    add             v12.4s,   v12.4s,   v12.4s
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-    shrn            ROW1L.4h, v2.4s,    #16
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    sshll           v10.4s,   ROW0L.4h, #13
-    shrn            ROW2L.4h, v2.4s,    #16
-    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW3L.4h, v10.4s,   #16
-    shrn            ROW0L.4h, v12.4s,   #16
-    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
-    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
-    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
-    sshll           v6.4s,    ROW4L.4h, #13
-    mov             v8.16b,   v12.16b
-    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
-    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
-    add             v2.4s,    v2.4s,    v12.4s
-    add             v12.4s,   v12.4s,   v12.4s
-    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
-    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW6R.4h, v2.4s,    #16
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    sshll           v10.4s,   ROW4L.4h, #13
-    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    shrn            ROW5R.4h, v6.4s,    #16
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW7R.4h, v4.4s,    #16
-    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW4R.4h, v6.4s,    #16
-    b               2b /* Go to epilogue */
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    mov             v6.16b, v15.16b  /* v6..v9 are not computed on this path: fill them with the value saved from v10.d[1] */
+    mov             v7.16b, v15.16b
+    mov             v8.16b, v15.16b
+    mov             v9.16b, v15.16b
+    b               1b  /* continue at label 1 (above this chunk) */
+
+.balign 16
+3:
+    cbnz            TMP4, 4f  /* TMP4 (TMP2 << 32) non-zero too: no shortcut, take the full path at 4: */
+    /* Left AC coef is zero: only the high 4 lanes (smull2/sshll2) go through the full 1-D IDCT */
+    dup             v14.2d, v10.d[0]  /* save v10.d[0] in both halves of v14; copied into v2..v5 below */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 on v22/v26; NOTE(review): both still hold the .8h sums from above and v22 is not read again before "b 1b" - looks dead, confirm */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 on v24/v26; NOTE(review): both still hold the .8h sums from above and v24 is not read again before "b 1b" - looks dead, confirm */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    mov             v2.16b, v14.16b  /* v2..v5 are not computed on this path: fill them with the value saved from v10.d[0] */
+    mov             v3.16b, v14.16b
+    mov             v4.16b, v14.16b
+    mov             v5.16b, v14.16b
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b  /* continue at label 1 (above this chunk) */
+
+.balign 16
+4:
+    /* "No" AC coef is zero: reached when both the upper and the lower 32 bits of TMP2 are non-zero, so the low (smull/sshll) and high (smull2/sshll2) lanes both get the full 1-D IDCT */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* low lanes: wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* high lanes: wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* low lanes: wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* high lanes: wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b  /* continue at label 1 (above this chunk) */
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -734,23 +740,26 @@
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
 
-    .unreq          ROW0L
-    .unreq          ROW0R
-    .unreq          ROW1L
-    .unreq          ROW1R
-    .unreq          ROW2L
-    .unreq          ROW2R
-    .unreq          ROW3L
-    .unreq          ROW3R
-    .unreq          ROW4L
-    .unreq          ROW4R
-    .unreq          ROW5L
-    .unreq          ROW5R
-    .unreq          ROW6L
-    .unreq          ROW6R
-    .unreq          ROW7L
-    .unreq          ROW7R
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
 
 
 /*****************************************************************************/
@@ -778,10 +787,10 @@
 
 .balign 16
 Ljsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: (277/256 - 1) * 2^15 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: (362/256 - 1) * 2^15 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: (473/256 - 1) * 2^15 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: (669/256 - 2) * 2^15 */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -791,261 +800,182 @@
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x22
-    TMP5            .req x23
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
 
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
-     *   0 | d16     | d17     ( v8.8h  )
-     *   1 | d18     | d19     ( v9.8h  )
-     *   2 | d20     | d21     ( v10.8h )
-     *   3 | d22     | d23     ( v11.8h )
-     *   4 | d24     | d25     ( v12.8h )
-     *   5 | d26     | d27     ( v13.8h )
-     *   6 | d28     | d29     ( v14.8h )
-     *   7 | d30     | d31     ( v15.8h )
+     *   0 | d16     | d17     ( v16.8h )
+     *   1 | d18     | d19     ( v17.8h )
+     *   2 | d20     | d21     ( v18.8h )
+     *   3 | d22     | d23     ( v19.8h )
+     *   4 | d24     | d25     ( v20.8h )
+     *   5 | d26     | d27     ( v21.8h )
+     *   6 | d28     | d29     ( v22.8h )
+     *   7 | d30     | d31     ( v23.8h )
      */
     /* Save NEON registers used in fast IDCT */
-    sub             sp, sp, #176
-    stp             x22, x23, [sp], 16
-    adr             x23, Ljsimd_idct_ifast_neon_consts
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
+    adr             TMP5, Ljsimd_idct_ifast_neon_consts
+    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
-    mul             v8.8h,  v8.8h,  v0.8h
+    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
+    mul             v16.8h, v16.8h, v0.8h
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v9.8h,  v9.8h,  v1.8h
-    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
-    mul             v10.8h, v10.8h, v2.8h
+    mul             v17.8h, v17.8h, v1.8h
+    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
+    mul             v18.8h, v18.8h, v2.8h
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    mul             v11.8h, v11.8h, v3.8h
-    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
-    mul             v12.8h, v12.8h, v0.8h
+    mul             v19.8h, v19.8h, v3.8h
+    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
+    mul             v20.8h, v20.8h, v0.8h
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v14.8h, v14.8h, v2.8h
-    mul             v13.8h, v13.8h, v1.8h
-    ld1             {v0.4h}, [x23]      /* load constants */
-    mul             v15.8h, v15.8h, v3.8h
+    mul             v22.8h, v22.8h, v2.8h
+    mul             v21.8h, v21.8h, v1.8h
+    ld1             {v0.4h}, [TMP5]        /* load constants */
+    mul             v23.8h, v23.8h, v3.8h
 
     /* 1-D IDCT, pass 1 */
-    sub             v2.8h,    v10.8h,   v14.8h
-    add             v14.8h,   v10.8h,   v14.8h
-    sub             v1.8h,    v11.8h,   v13.8h
-    add             v13.8h,   v11.8h,   v13.8h
-    sub             v5.8h,    v9.8h,    v15.8h
-    add             v15.8h,   v9.8h,    v15.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v10.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v15.8h,   v13.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v10.8h,   v10.8h,   v14.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v8.8h,    v12.8h
-    add             v12.8h,   v8.8h,    v12.8h
-    add             v9.8h,    v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v10.8h
-    sub             v10.8h,   v6.8h,    v10.8h
-    add             v6.8h,    v15.8h,   v13.8h
-    add             v8.8h,    v12.8h,   v14.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v12.8h,   v12.8h,   v14.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v9.8h,    v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v15.8h,   v8.8h,    v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v8.8h,    v8.8h,    v6.8h
-    add             v14.8h,   v5.8h,    v3.8h
-    sub             v9.8h,    v5.8h,    v3.8h
-    sub             v13.8h,   v10.8h,   v2.8h
-    add             v10.8h,   v10.8h,   v2.8h
-    /* Transpose  q8-q9 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8h,    v8.8h,    v9.8h
-    trn2            v9.8h,    v18.8h,   v9.8h
-    sub             v11.8h,   v12.8h,   v1.8h
-    /* Transpose  q14-q15 */
-    mov             v18.16b,  v14.16b
-    trn1            v14.8h,   v14.8h,   v15.8h
-    trn2            v15.8h,   v18.8h,   v15.8h
-    add             v12.8h,   v12.8h,   v1.8h
-    /* Transpose  q10-q11 */
-    mov             v18.16b,  v10.16b
-    trn1            v10.8h,   v10.8h,   v11.8h
-    trn2            v11.8h,   v18.8h,   v11.8h
-    /* Transpose  q12-q13 */
-    mov             v18.16b,  v12.16b
-    trn1            v12.8h,   v12.8h,   v13.8h
-    trn2            v13.8h,   v18.8h,   v13.8h
-    /* Transpose  q9-q11 */
-    mov             v18.16b,  v9.16b
-    trn1            v9.4s,    v9.4s,    v11.4s
-    trn2            v11.4s,   v18.4s,   v11.4s
-    /* Transpose  q12-q14 */
-    mov             v18.16b,  v12.16b
-    trn1            v12.4s,   v12.4s,   v14.4s
-    trn2            v14.4s,   v18.4s,   v14.4s
-    /* Transpose  q8-q10 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.4s,    v8.4s,    v10.4s
-    trn2            v10.4s,   v18.4s,   v10.4s
-    /* Transpose  q13-q15 */
-    mov             v18.16b,  v13.16b
-    trn1            v13.4s,   v13.4s,   v15.4s
-    trn2            v15.4s,   v18.4s,   v15.4s
-    /* vswp            v14.4h,   v10-MSB.4h */
-    umov            x22, v14.d[0]
-    ins             v14.d[0], v10.d[1]
-    ins             v10.d[1], x22
-    /* vswp            v13.4h,   v9MSB.4h */
-
-    umov            x22, v13.d[0]
-    ins             v13.d[0], v9.d[1]
-    ins             v9.d[1], x22
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
     /* 1-D IDCT, pass 2 */
-    sub             v2.8h,    v10.8h,   v14.8h
-    /* vswp            v15.4h,   v11MSB.4h */
-    umov            x22, v15.d[0]
-    ins             v15.d[0], v11.d[1]
-    ins             v11.d[1], x22
-    add             v14.8h,   v10.8h,   v14.8h
-    /* vswp            v12.4h,   v8-MSB.4h */
-    umov            x22, v12.d[0]
-    ins             v12.d[0], v8.d[1]
-    ins             v8.d[1],  x22
-    sub             v1.8h,    v11.8h,   v13.8h
-    add             v13.8h,   v11.8h,   v13.8h
-    sub             v5.8h,    v9.8h,    v15.8h
-    add             v15.8h,   v9.8h,    v15.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v10.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v15.8h,   v13.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v10.8h,   v10.8h,   v14.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v8.8h,    v12.8h
-    add             v12.8h,   v8.8h,    v12.8h
-    add             v9.8h,    v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v10.8h
-    sub             v10.8h,   v6.8h,    v10.8h
-    add             v6.8h,    v15.8h,   v13.8h
-    add             v8.8h,    v12.8h,   v14.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v12.8h,   v12.8h,   v14.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v9.8h,    v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v15.8h,   v8.8h,    v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v8.8h,    v8.8h,    v6.8h
-    add             v14.8h,   v5.8h,    v3.8h
-    sub             v9.8h,    v5.8h,    v3.8h
-    sub             v13.8h,   v10.8h,   v2.8h
-    add             v10.8h,   v10.8h,   v2.8h
-    sub             v11.8h,   v12.8h,   v1.8h
-    add             v12.8h,   v12.8h,   v1.8h
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
     /* Descale to 8-bit and range limit */
-    movi            v0.16b,   #0x80
-    sqshrn          v8.8b,    v8.8h,    #5
-    sqshrn2         v8.16b,   v9.8h,    #5
-    sqshrn          v9.8b,    v10.8h,   #5
-    sqshrn2         v9.16b,   v11.8h,   #5
-    sqshrn          v10.8b,   v12.8h,   #5
-    sqshrn2         v10.16b,  v13.8h,   #5
-    sqshrn          v11.8b,   v14.8h,   #5
-    sqshrn2         v11.16b,  v15.8h,   #5
-    add             v8.16b,   v8.16b,   v0.16b
-    add             v9.16b,   v9.16b,   v0.16b
-    add             v10.16b,  v10.16b,  v0.16b
-    add             v11.16b,  v11.16b,  v0.16b
+    movi            v0.16b, #0x80
+      /* Prepare pointers (dual-issue with NEON instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqshrn          v28.8b, v16.8h, #5
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqshrn          v29.8b, v17.8h, #5
+      add             TMP1, TMP1, OUTPUT_COL
+    sqshrn          v30.8b, v18.8h, #5
+      add             TMP2, TMP2, OUTPUT_COL
+    sqshrn          v31.8b, v19.8h, #5
+      add             TMP3, TMP3, OUTPUT_COL
+    sqshrn2         v28.16b, v20.8h, #5
+      add             TMP4, TMP4, OUTPUT_COL
+    sqshrn2         v29.16b, v21.8h, #5
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqshrn2         v30.16b, v22.8h, #5
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqshrn2         v31.16b, v23.8h, #5
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
     /* Transpose the final 8-bit samples */
-    /* Transpose  q8-q9 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8h,    v8.8h,    v9.8h
-    trn2            v9.8h,    v18.8h,   v9.8h
-    /* Transpose  q10-q11 */
-    mov             v18.16b,  v10.16b
-    trn1            v10.8h,   v10.8h,   v11.8h
-    trn2            v11.8h,   v18.8h,   v11.8h
-    /* Transpose  q8-q10 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.4s,    v8.4s,    v10.4s
-    trn2            v10.4s,   v18.4s,   v10.4s
-    /* Transpose  q9-q11 */
-    mov             v18.16b,  v9.16b
-    trn1            v9.4s,    v9.4s,    v11.4s
-    trn2            v11.4s,   v18.4s,   v11.4s
-    /* make copy */
-    ins             v17.d[0], v8.d[1]
-    /* Transpose  d16-d17-msb */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8b,    v8.8b,    v17.8b
-    trn2            v17.8b,   v18.8b,   v17.8b
-    /* make copy */
-    ins             v19.d[0], v9.d[1]
-    mov             v18.16b,  v9.16b
-    trn1            v9.8b,    v9.8b,    v19.8b
-    trn2            v19.8b,   v18.8b,   v19.8b
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
     /* Store results to the output buffer */
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v8.8b},  [TMP1]
-    st1             {v17.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v9.8b},  [TMP1]
-    /* make copy */
-    ins             v7.d[0],  v10.d[1]
-    mov             v18.16b,  v10.16b
-    trn1            v10.8b,   v10.8b,   v7.8b
-    trn2            v7.8b,    v18.8b,   v7.8b
-    st1             {v19.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    add             TMP4,     TMP4,     OUTPUT_COL
-    add             TMP5,     TMP5,     OUTPUT_COL
-    st1             {v10.8b}, [TMP1]
-    /* make copy */
-    ins             v16.d[0], v11.d[1]
-    mov             v18.16b,  v11.16b
-    trn1            v11.8b,   v11.8b,   v16.8b
-    trn2            v16.8b,   v18.8b,   v16.8b
-    st1             {v7.8b},  [TMP2]
-    st1             {v11.8b}, [TMP4]
-    st1             {v16.8b}, [TMP5]
-    sub             sp, sp, #176
-    ldp             x22, x23, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1056,6 +986,10 @@
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
 
 
 /*****************************************************************************/
@@ -1080,81 +1014,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
 Ljsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* v0.h[0] */
-    .short     -FIX_0_765366865    /* v0.h[1] */
-    .short     -FIX_0_211164243    /* v0.h[2] */
-    .short     FIX_1_451774981     /* v0.h[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* v2.h[0] */
-    .short     FIX_2_562915447     /* v2.h[1] */
-    .short     1 << (CONST_BITS+1) /* v2.h[2] */
-    .short     0                   /* v2.h[3] */
+  .short FIX_1_847759065      /* v0.h[0] */
+  .short -FIX_0_765366865     /* v0.h[1] */
+  .short -FIX_0_211164243     /* v0.h[2] */
+  .short FIX_1_451774981      /* v0.h[3] */
+  .short -FIX_2_172734803     /* v1.h[0] */
+  .short FIX_1_061594337      /* v1.h[1] */
+  .short -FIX_0_509795579     /* v1.h[2] */
+  .short -FIX_0_601344887     /* v1.h[3] */
+  .short FIX_0_899976223      /* v2.h[0] */
+  .short FIX_2_562915447      /* v2.h[1] */
+  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
+  .short 0                    /* v2.h[3] (padding, not read by idct_helper) */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4,    v2.h[2]
-    smlal           v28.4s, \x8,    v0.h[0]
-    smlal           v28.4s, \x14,   v0.h[1]
+    smull           v28.4s, \x4, v2.h[2]    /* v28 = x4 * (1 << (CONST_BITS+1)) */
+    smlal           v28.4s, \x8, v0.h[0]    /*     + x8 * FIX_1_847759065 */
+    smlal           v28.4s, \x14, v0.h[1]   /*     + x14 * -FIX_0_765366865 */
 
-    smull           v26.4s, \x16,   v1.h[2]
-    smlal           v26.4s, \x12,   v1.h[3]
-    smlal           v26.4s, \x10,   v2.h[0]
-    smlal           v26.4s, \x6,    v2.h[1]
+    smull           v26.4s, \x16, v1.h[2]   /* v26 = x16 * -FIX_0_509795579 */
+    smlal           v26.4s, \x12, v1.h[3]   /*     + x12 * -FIX_0_601344887 */
+    smlal           v26.4s, \x10, v2.h[0]   /*     + x10 * FIX_0_899976223 */
+    smlal           v26.4s, \x6, v2.h[1]    /*     + x6 * FIX_2_562915447 */
 
-    smull           v30.4s, \x4,    v2.h[2]
-    smlsl           v30.4s, \x8,    v0.h[0]
-    smlsl           v30.4s, \x14,   v0.h[1]
+    smull           v30.4s, \x4, v2.h[2]    /* v30 = same three terms as v28, */
+    smlsl           v30.4s, \x8, v0.h[0]    /*       but with the x8 and x14 */
+    smlsl           v30.4s, \x14, v0.h[1]   /*       products subtracted */
 
-    smull           v24.4s, \x16,   v0.h[2]
-    smlal           v24.4s, \x12,   v0.h[3]
-    smlal           v24.4s, \x10,   v1.h[0]
-    smlal           v24.4s, \x6,    v1.h[1]
+    smull           v24.4s, \x16, v0.h[2]   /* v24 = x16 * -FIX_0_211164243 */
+    smlal           v24.4s, \x12, v0.h[3]   /*     + x12 * FIX_1_451774981 */
+    smlal           v24.4s, \x10, v1.h[0]   /*     + x10 * -FIX_2_172734803 */
+    smlal           v24.4s, \x6, v1.h[1]    /*     + x6 * FIX_1_061594337 */
 
     add             v20.4s, v28.4s, v26.4s
     sub             v28.4s, v28.4s, v26.4s
 
-.if \shift > 16
+  .if \shift > 16                           /* rshrn into .4h accepts shifts of 1-16 only */
     srshr           v20.4s, v20.4s, #\shift
     srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26,   v20.4s
-    xtn             \y29,   v28.4s
-.else
-    rshrn           \y26,   v20.4s, #\shift
-    rshrn           \y29,   v28.4s, #\shift
-.endif
+    xtn             \y26, v20.4s            /* y26 = low 16 bits of the rounded sum */
+    xtn             \y29, v28.4s            /* y29 = low 16 bits of the rounded difference */
+  .else                                     /* rounding shift and narrow in one step */
+    rshrn           \y26, v20.4s, #\shift
+    rshrn           \y29, v28.4s, #\shift
+  .endif
 
     add             v20.4s, v30.4s, v24.4s
     sub             v30.4s, v30.4s, v24.4s
 
-.if \shift > 16
+  .if \shift > 16                           /* same two-step descale as above */
     srshr           v20.4s, v20.4s, #\shift
     srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27,   v20.4s
-    xtn             \y28,   v30.4s
-.else
-    rshrn           \y27,   v20.4s, #\shift
-    rshrn           \y28,   v30.4s, #\shift
-.endif
-
+    xtn             \y27, v20.4s            /* y27 = low 16 bits of the rounded sum */
+    xtn             \y28, v30.4s            /* y28 = low 16 bits of the rounded difference */
+  .else
+    rshrn           \y27, v20.4s, #\shift
+    rshrn           \y28, v30.4s, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1204,39 +1137,43 @@
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]    /* 128 bit q4 */
+    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]    /* 128 bit q6 */
+    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
     mul             v8.4h, v8.4h, v22.4h
     mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]    /* 128 bit q8 */
+    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]  /* 128 bit q10 */
+    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]  /* 128 bit q12 */
+    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v14.4h, v14.4h, v28.4h
     mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]  /* 128 bit q14 */
+    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]  /* 128 bit q16 */
+    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
 
     /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
+                    v4.4h, v6.4h, v8.4h, v10.4h
     transpose_4x4   v4, v6, v8, v10, v3
     ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
+    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
+                    v5.4h, v7.4h, v9.4h, v11.4h
     transpose_4x4   v5, v7, v9, v11, v3
     ins             v10.d[1], v11.d[0]
+
     /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
+                    v26.4h, v27.4h, v28.4h, v29.4h
     transpose_4x4   v26, v27, v28, v29, v3
 
     /* Range limit */
@@ -1327,31 +1264,30 @@
 
 .balign 8
 Ljsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* v14[0] */
-    .short     FIX_0_850430095     /* v14[1] */
-    .short     -FIX_1_272758580    /* v14[2] */
-    .short     FIX_3_624509785     /* v14[3] */
+  .short -FIX_0_720959822  /* v14.h[0] */
+  .short FIX_0_850430095   /* v14.h[1] */
+  .short -FIX_1_272758580  /* v14.h[2] */
+  .short FIX_3_624509785   /* v14.h[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll      v15.4s, \x4,    #15
-    smull      v26.4s, \x6,    v14.h[3]
-    smlal      v26.4s, \x10,   v14.h[2]
-    smlal      v26.4s, \x12,   v14.h[1]
-    smlal      v26.4s, \x16,   v14.h[0]
+    sshll           v15.4s, \x4, #15        /* v15 = x4 widened to 32 bits, << 15 */
+    smull           v26.4s, \x6, v14.h[3]   /* v26 = x6 * FIX_3_624509785 */
+    smlal           v26.4s, \x10, v14.h[2]  /*     + x10 * -FIX_1_272758580 */
+    smlal           v26.4s, \x12, v14.h[1]  /*     + x12 * FIX_0_850430095 */
+    smlal           v26.4s, \x16, v14.h[0]  /*     + x16 * -FIX_0_720959822 */
 
-    add        v20.4s, v15.4s, v26.4s
-    sub        v15.4s, v15.4s, v26.4s
+    add             v20.4s, v15.4s, v26.4s  /* v20 = v15 + v26 */
+    sub             v15.4s, v15.4s, v26.4s  /* v15 = v15 - v26 */
 
-.if \shift > 16
-    srshr      v20.4s, v20.4s, #\shift
-    srshr      v15.4s, v15.4s, #\shift
-    xtn        \y26,   v20.4s
-    xtn        \y27,   v15.4s
-.else
-    rshrn      \y26,   v20.4s, #\shift
-    rshrn      \y27,   v15.4s, #\shift
-.endif
-
+  .if \shift > 16                           /* rshrn into .4h accepts shifts of 1-16 only */
+    srshr           v20.4s, v20.4s, #\shift /* so round-shift at 32 bits first, */
+    srshr           v15.4s, v15.4s, #\shift
+    xtn             \y26, v20.4s            /* then keep the low 16 bits of each lane */
+    xtn             \y27, v15.4s
+  .else                                     /* rounding shift and narrow in one step */
+    rshrn           \y26, v20.4s, #\shift   /* y26 = descaled sum */
+    rshrn           \y27, v15.4s, #\shift   /* y27 = descaled difference */
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1424,28 +1360,28 @@
     /* Pass 1 */
 #if 0
     idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
+    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
+    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
 #else
-    smull           v26.4s, v6.4h,  v14.h[3]
+    smull           v26.4s, v6.4h, v14.h[3]
     smlal           v26.4s, v10.4h, v14.h[2]
     smlal           v26.4s, v12.4h, v14.h[1]
     smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h,  v14.h[3]
+    smull           v24.4s, v7.4h, v14.h[3]
     smlal           v24.4s, v11.4h, v14.h[2]
     smlal           v24.4s, v13.4h, v14.h[1]
     smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h,  #15
-    sshll           v30.4s, v5.4h,  #15
+    sshll           v15.4s, v4.4h, #15
+    sshll           v30.4s, v5.4h, #15
     add             v20.4s, v15.4s, v26.4s
     sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h,  v20.4s, #13
-    rshrn           v6.4h,  v15.4s, #13
+    rshrn           v4.4h, v20.4s, #13
+    rshrn           v6.4h, v15.4s, #13
     add             v20.4s, v30.4s, v24.4s
     sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h,  v20.4s, #13
-    rshrn           v7.4h,  v15.4s, #13
+    rshrn           v5.4h, v20.4s, #13
+    rshrn           v7.4h, v15.4s, #13
     ins             v4.d[1], v5.d[0]
     ins             v6.d[1], v7.d[0]
     transpose       v4, v6, v3, .16b, .8h
@@ -1509,188 +1445,222 @@
  * Colorspace conversion YCbCr -> RGB
  */
 
-
 .macro do_load size
+  .if \size == 8                           /* full group: 8 bytes per plane */
+    ld1             {v4.8b}, [U], 8         /* v4 = 8 bytes from [U], U += 8 */
+    ld1             {v5.8b}, [V], 8         /* v5 = 8 bytes from [V], V += 8 */
+    ld1             {v0.8b}, [Y], 8         /* v0 = 8 bytes from [Y], Y += 8 */
+    prfm            pldl1keep, [U, #64]     /* prefetch 64 bytes past each */
+    prfm            pldl1keep, [V, #64]     /* of the updated pointers     */
+    prfm            pldl1keep, [Y, #64]
+  .elseif \size == 4                       /* partial group: byte lanes 0-3 */
+    ld1             {v4.b}[0], [U], 1
+    ld1             {v4.b}[1], [U], 1
+    ld1             {v4.b}[2], [U], 1
+    ld1             {v4.b}[3], [U], 1
+    ld1             {v5.b}[0], [V], 1
+    ld1             {v5.b}[1], [V], 1
+    ld1             {v5.b}[2], [V], 1
+    ld1             {v5.b}[3], [V], 1
+    ld1             {v0.b}[0], [Y], 1
+    ld1             {v0.b}[1], [Y], 1
+    ld1             {v0.b}[2], [Y], 1
+    ld1             {v0.b}[3], [Y], 1
+  .elseif \size == 2                       /* partial group: byte lanes 4-5 */
+    ld1             {v4.b}[4], [U], 1
+    ld1             {v4.b}[5], [U], 1
+    ld1             {v5.b}[4], [V], 1
+    ld1             {v5.b}[5], [V], 1
+    ld1             {v0.b}[4], [Y], 1
+    ld1             {v0.b}[5], [Y], 1
+  .elseif \size == 1                       /* partial group: byte lane 6 */
+    ld1             {v4.b}[6], [U], 1
+    ld1             {v5.b}[6], [V], 1
+    ld1             {v0.b}[6], [Y], 1
+  .else  /* sizes 4/2/1 use disjoint lanes, presumably so a tail of up to 7 pixels can be built from successive calls - confirm at call sites */
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_store bpp, size, fast_st3
+  .if \bpp == 24
     .if \size == 8
-        ld1  {v4.8b}, [U], 8
-        ld1  {v5.8b}, [V], 8
-        ld1  {v0.8b}, [Y], 8
-        prfm pldl1keep, [U, #64]
-        prfm pldl1keep, [V, #64]
-        prfm pldl1keep, [Y, #64]
+      .if \fast_st3 == 1
+        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      .else
+        st1         {v10.b}[0], [RGB], #1
+        st1         {v11.b}[0], [RGB], #1
+        st1         {v12.b}[0], [RGB], #1
+
+        st1         {v10.b}[1], [RGB], #1
+        st1         {v11.b}[1], [RGB], #1
+        st1         {v12.b}[1], [RGB], #1
+
+        st1         {v10.b}[2], [RGB], #1
+        st1         {v11.b}[2], [RGB], #1
+        st1         {v12.b}[2], [RGB], #1
+
+        st1         {v10.b}[3], [RGB], #1
+        st1         {v11.b}[3], [RGB], #1
+        st1         {v12.b}[3], [RGB], #1
+
+        st1         {v10.b}[4], [RGB], #1
+        st1         {v11.b}[4], [RGB], #1
+        st1         {v12.b}[4], [RGB], #1
+
+        st1         {v10.b}[5], [RGB], #1
+        st1         {v11.b}[5], [RGB], #1
+        st1         {v12.b}[5], [RGB], #1
+
+        st1         {v10.b}[6], [RGB], #1
+        st1         {v11.b}[6], [RGB], #1
+        st1         {v12.b}[6], [RGB], #1
+
+        st1         {v10.b}[7], [RGB], #1
+        st1         {v11.b}[7], [RGB], #1
+        st1         {v12.b}[7], [RGB], #1
+      .endif
     .elseif \size == 4
-        ld1  {v4.b}[0], [U], 1
-        ld1  {v4.b}[1], [U], 1
-        ld1  {v4.b}[2], [U], 1
-        ld1  {v4.b}[3], [U], 1
-        ld1  {v5.b}[0], [V], 1
-        ld1  {v5.b}[1], [V], 1
-        ld1  {v5.b}[2], [V], 1
-        ld1  {v5.b}[3], [V], 1
-        ld1  {v0.b}[0], [Y], 1
-        ld1  {v0.b}[1], [Y], 1
-        ld1  {v0.b}[2], [Y], 1
-        ld1  {v0.b}[3], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
     .elseif \size == 2
-        ld1  {v4.b}[4], [U], 1
-        ld1  {v4.b}[5], [U], 1
-        ld1  {v5.b}[4], [V], 1
-        ld1  {v5.b}[5], [V], 1
-        ld1  {v0.b}[4], [Y], 1
-        ld1  {v0.b}[5], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
     .elseif \size == 1
-        ld1  {v4.b}[6], [U], 1
-        ld1  {v5.b}[6], [V], 1
-        ld1  {v0.b}[6], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
     .else
-        .error unsupported macroblock size
+     .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+    .elseif \size == 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+    .elseif \size == 2
+      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+    .elseif \size == 1
+      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp==16
+    .if \size == 8
+      st1           {v25.8h}, [RGB], 16
+    .elseif \size == 4
+      st1           {v25.4h}, [RGB], 8
+    .elseif \size == 2
+      st1           {v25.h}[4], [RGB], 2
+      st1           {v25.h}[5], [RGB], 2
+    .elseif \size == 1
+      st1           {v25.h}[6], [RGB], 2
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
-.macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
-        .elseif \size == 4
-            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
-        .elseif \size == 2
-            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
-        .elseif \size == 1
-            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
-        .elseif \size == 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
-        .elseif \size == 2
-            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
-        .elseif \size == 1
-            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp==16
-        .if \size == 8
-            st1  {v25.8h}, [RGB],16
-        .elseif \size == 4
-            st1  {v25.4h}, [RGB],8
-        .elseif \size == 2
-            st1  {v25.h}[4], [RGB],2
-            st1  {v25.h}[5], [RGB],2
-        .elseif \size == 1
-            st1  {v25.h}[6], [RGB],2
-        .else
-            .error unsupported macroblock size
-        .endif
-     .else
-        .error unsupported bpp
-    .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+                                           g_offs, gsize, b_offs, bsize, \
+                                           defsize, fast_st3
 
 /*
  * 2-stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
-    uaddw        v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
-    rshrn        v20.4h, v20.4s, #15
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16
-    sqxtun       v1\g_offs\defsize, v20.8h
-    sqxtun       v1\r_offs\defsize, v24.8h
-    sqxtun       v1\b_offs\defsize, v28.8h
-.else
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    sri          v25.8h, v21.8h, #5
-    sri          v25.8h, v29.8h, #11
-.endif
-
+    rshrn           v20.4h, v20.4s, #15
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    sri             v25.8h, v21.8h, #5
+    sri             v25.8h, v29.8h, #11
+  .endif
 .endm
 
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-    rshrn        v20.4h, v20.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    ld1          {v4.8b}, [U], 8
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    ld1          {v5.8b}, [V], 8
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
-    sqxtun       v1\g_offs\defsize, v20.8h
-    ld1          {v0.8b}, [Y], 8
-    sqxtun       v1\r_offs\defsize, v24.8h
-    prfm         pldl1keep, [U, #64]
-    prfm         pldl1keep, [V, #64]
-    prfm         pldl1keep, [Y, #64]
-    sqxtun       v1\b_offs\defsize, v28.8h
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-.else /**************************** rgb565 ***********************************/
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    ld1          {v0.8b}, [Y], 8
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    sri          v25.8h, v21.8h, #5
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    prfm         pldl1keep, [U, #64]
-    prfm         pldl1keep, [V, #64]
-    prfm         pldl1keep, [Y, #64]
-    sri          v25.8h, v29.8h, #11
-.endif
-    do_store     \bpp, 8
-    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    ld1             {v0.8b}, [Y], 8
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8, \fast_st3
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb
@@ -1703,13 +1673,21 @@
  */
 
 .balign 16
+.if \fast_st3 == 1
 Ljsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+.else
+Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
+.endif
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
+.if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
     OUTPUT_WIDTH    .req x0
     INPUT_BUF       .req x1
     INPUT_ROW       .req x2
@@ -1728,8 +1706,14 @@
 
     sub             sp, sp, 336
     str             x15, [sp], 16
+
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    adr             x15, Ljsimd_ycc_\colorid\()_neon_consts
+    .if \fast_st3 == 1
+      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
+    .else
+      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
+    .endif
+
     /* Save NEON registers */
     st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
     st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
@@ -1777,12 +1761,12 @@
     subs            N, N, #8
     b.lt            2f
 1:
-    do_yuv_to_rgb_stage2_store_load_stage1
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
     subs            N, N, #8
     b.ge            1b
 2:
     do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
+    do_store        \bpp, 8, \fast_st3
     tst             N, #7
     b.eq            8f
 3:
@@ -1801,15 +1785,15 @@
     do_yuv_to_rgb
     tst             N, #4
     b.eq            6f
-    do_store        \bpp, 4
+    do_store        \bpp, 4, \fast_st3
 6:
     tst             N, #2
     b.eq            7f
-    do_store        \bpp, 2
+    do_store        \bpp, 2, \fast_st3
 7:
     tst             N, #1
     b.eq            8f
-    do_store        \bpp, 1
+    do_store        \bpp, 1, \fast_st3
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
     b.gt            0b
@@ -1848,15 +1832,1622 @@
 .purgem do_yuv_to_rgb_stage1
 .purgem do_yuv_to_rgb_stage2
 .purgem do_yuv_to_rgb_stage2_store_load_stage1
+
 .endm
 
-/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
+
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0  /* emits jsimd_ycc_extrgb_convert_neon_slowst3 */
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0  /* emits jsimd_ycc_extbgr_convert_neon_slowst3 */
+
 .purgem do_load
 .purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8                           /* full group: 8 bytes per plane */
+    st1             {v20.8b}, [Y], #8       /* [Y] <- v20, Y += 8 */
+    st1             {v21.8b}, [U], #8       /* [U] <- v21, U += 8 */
+    st1             {v22.8b}, [V], #8       /* [V] <- v22, V += 8 */
+  .elseif \size == 4                       /* partial group: byte lanes 0-3 */
+    st1             {v20.b}[0], [Y], #1
+    st1             {v20.b}[1], [Y], #1
+    st1             {v20.b}[2], [Y], #1
+    st1             {v20.b}[3], [Y], #1
+    st1             {v21.b}[0], [U], #1
+    st1             {v21.b}[1], [U], #1
+    st1             {v21.b}[2], [U], #1
+    st1             {v21.b}[3], [U], #1
+    st1             {v22.b}[0], [V], #1
+    st1             {v22.b}[1], [V], #1
+    st1             {v22.b}[2], [V], #1
+    st1             {v22.b}[3], [V], #1
+  .elseif \size == 2                       /* partial group: byte lanes 4-5 */
+    st1             {v20.b}[4], [Y], #1
+    st1             {v20.b}[5], [Y], #1
+    st1             {v21.b}[4], [U], #1
+    st1             {v21.b}[5], [U], #1
+    st1             {v22.b}[4], [V], #1
+    st1             {v22.b}[5], [V], #1
+  .elseif \size == 1                       /* partial group: byte lane 6 */
+    st1             {v20.b}[6], [Y], #1
+    st1             {v21.b}[6], [U], #1
+    st1             {v22.b}[6], [V], #1
+  .else  /* lanes for sizes 4/2/1 mirror the ones the matching do_load fills */
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size, fast_ld3
+  .if \bpp == 24                           /* 3 bytes per pixel -> v10, v11, v12 */
+    .if \size == 8
+      .if \fast_ld3 == 1                   /* one de-interleaving LD3 */
+        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      .else  /* fills the same lanes 0-7 with single-lane loads; presumably for CPUs where LD3 is slow */
+        ld1         {v10.b}[0], [RGB], #1
+        ld1         {v11.b}[0], [RGB], #1
+        ld1         {v12.b}[0], [RGB], #1
+
+        ld1         {v10.b}[1], [RGB], #1
+        ld1         {v11.b}[1], [RGB], #1
+        ld1         {v12.b}[1], [RGB], #1
+
+        ld1         {v10.b}[2], [RGB], #1
+        ld1         {v11.b}[2], [RGB], #1
+        ld1         {v12.b}[2], [RGB], #1
+
+        ld1         {v10.b}[3], [RGB], #1
+        ld1         {v11.b}[3], [RGB], #1
+        ld1         {v12.b}[3], [RGB], #1
+
+        ld1         {v10.b}[4], [RGB], #1
+        ld1         {v11.b}[4], [RGB], #1
+        ld1         {v12.b}[4], [RGB], #1
+
+        ld1         {v10.b}[5], [RGB], #1
+        ld1         {v11.b}[5], [RGB], #1
+        ld1         {v12.b}[5], [RGB], #1
+
+        ld1         {v10.b}[6], [RGB], #1
+        ld1         {v11.b}[6], [RGB], #1
+        ld1         {v12.b}[6], [RGB], #1
+
+        ld1         {v10.b}[7], [RGB], #1
+        ld1         {v11.b}[7], [RGB], #1
+        ld1         {v12.b}[7], [RGB], #1
+      .endif
+      prfm          pldl1keep, [RGB, #128]  /* prefetch 128 bytes past the updated pointer */
+    .elseif \size == 4                     /* partial group: byte lanes 0-3 */
+      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
+    .elseif \size == 2                     /* partial group: byte lanes 4-5 */
+      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
+    .elseif \size == 1                     /* partial group: byte lane 6 */
+      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32                       /* 4 bytes per pixel -> v10, v11, v12, v13 */
+    .if \size == 8
+      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+      prfm          pldl1keep, [RGB, #128]  /* prefetch 128 bytes past the updated pointer */
+    .elseif \size == 4                     /* partial group: byte lanes 0-3 */
+      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+    .elseif \size == 2                     /* partial group: byte lanes 4-5 */
+      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+    .elseif \size == 1                     /* partial group: byte lane 6 */
+      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else  /* only 24- and 32-bit pixel layouts are handled; fast_ld3 matters only for bpp 24, size 8 */
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+                                           b_offs, fast_ld3
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion (results: y = v20, u = v21, v = v22)
+ */
+
+.macro do_rgb_to_yuv_stage1
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s               /* u accumulators (v18, v26) and */
+    rev64           v26.4s, v1.4s               /* v accumulators (v28, v30) all */
+    rev64           v28.4s, v1.4s               /* start from the bias words     */
+    rev64           v30.4s, v1.4s               /* held in v1                    */
+    umull           v14.4s, v4.4h, v0.h[0]      /* y  = 19595 * r */
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]      /* u -= 11059 * r */
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]      /* v += 32768 * r */
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]      /* y += 38470 * g */
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]      /* u -= 21709 * g */
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]      /* v -= 27439 * g */
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]      /* y += 7471 * b */
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]      /* u += 32768 * b */
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]      /* v -= 5329 * b */
+    umlsl2          v30.4s, v8.8h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    rshrn           v20.4h, v14.4s, #16  /* y: rounding shift */
+    shrn            v22.4h, v18.4s, #16  /* u, v: truncating shift */
+    shrn            v24.4h, v28.4s, #16
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h       /* v20 = y */
+    xtn             v21.8b, v22.8h       /* v21 = u */
+    xtn             v22.8b, v24.8h       /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+    do_rgb_to_yuv_stage2
+    do_load         \bpp, 8, \fast_ld3
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+    do_rgb_to_yuv_stage1
+.endm
+
+.balign 16
+.if \fast_ld3 == 1
+Ljsimd_\colorid\()_ycc_neon_consts:
+.else
+Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
+.endif
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_ROW      .req w3  /* 32-bit JDIMENSION argument: upper half of x3 is unspecified */
+    NUM_ROWS        .req w4  /* 32-bit int argument: upper half of x4 is unspecified */
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants to v0 (coefficients) and v1 (bias words) */
+    .if \fast_ld3 == 1
+      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
+    .else
+      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
+    .endif
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save d8-d15 in a 64-byte frame; SP must stay at or below the saved data (no red zone) */
+    sub             sp, sp, #64
+    mov             x13, sp  /* x13 is free again after the constant load */
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x13], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x13], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8, \fast_ld3
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+3:
+    tbz             N, #2, 3f  /* tail: bits 2..0 of N select the 4/2/1 pixel loads */
+    do_load         \bpp, 4, \fast_ld3
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2, \fast_ld3
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1, \fast_ld3
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore d8-d15; the post-increments also release the 64-byte frame */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0  /* emits jsimd_extrgb_ycc_convert_neon_slowld3 */
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0  /* emits jsimd_extbgr_ycc_convert_neon_slowld3 */
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * Load data into workspace, applying unsigned->signed conversion
+ * (8 rows x 8 samples; each sample is widened to 16 bits and has 128 subtracted)
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
+ *       rid of VST1.16 instructions
+ */
+
+asm_function jsimd_convsamp_neon
+    SAMPLE_DATA     .req x0
+    START_COL       .req w1  /* 32-bit JDIMENSION argument: upper half of x1 is unspecified */
+    WORKSPACE       .req x2
+    TMP1            .req x9
+    TMP2            .req x10
+    TMP3            .req x11
+    TMP4            .req x12
+    TMP5            .req x13
+    TMP6            .req x14
+    TMP7            .req x15
+    TMP8            .req x4
+    TMPDUP          .req w3
+
+    mov             TMPDUP, #128
+    ldp             TMP1, TMP2, [SAMPLE_DATA], 16  /* row pointers 0-1 */
+    ldp             TMP3, TMP4, [SAMPLE_DATA], 16  /* row pointers 2-3 */
+    dup             v0.8b, TMPDUP                  /* v0 = 128 in every byte */
+    add             TMP1, TMP1, START_COL, uxtw    /* zero-extend the column offset */
+    add             TMP2, TMP2, START_COL, uxtw
+    ldp             TMP5, TMP6, [SAMPLE_DATA], 16  /* row pointers 4-5 */
+    add             TMP3, TMP3, START_COL, uxtw
+    add             TMP4, TMP4, START_COL, uxtw
+    ldp             TMP7, TMP8, [SAMPLE_DATA], 16  /* row pointers 6-7 */
+    add             TMP5, TMP5, START_COL, uxtw
+    add             TMP6, TMP6, START_COL, uxtw
+    ld1             {v16.8b}, [TMP1]
+    add             TMP7, TMP7, START_COL, uxtw
+    add             TMP8, TMP8, START_COL, uxtw
+    ld1             {v17.8b}, [TMP2]
+    usubl           v16.8h, v16.8b, v0.8b          /* widen to 16 bits, subtract 128 */
+    ld1             {v18.8b}, [TMP3]
+    usubl           v17.8h, v17.8b, v0.8b
+    ld1             {v19.8b}, [TMP4]
+    usubl           v18.8h, v18.8b, v0.8b
+    ld1             {v20.8b}, [TMP5]
+    usubl           v19.8h, v19.8b, v0.8b
+    ld1             {v21.8b}, [TMP6]
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64  /* rows 0-3 */
+    usubl           v20.8h, v20.8b, v0.8b
+    ld1             {v22.8b}, [TMP7]
+    usubl           v21.8h, v21.8b, v0.8b
+    ld1             {v23.8b}, [TMP8]
+    usubl           v22.8h, v22.8b, v0.8b
+    usubl           v23.8h, v23.8b, v0.8b
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64  /* rows 4-7 */
+
+    br              x30
+
+    .unreq          SAMPLE_DATA
+    .unreq          START_COL
+    .unreq          WORKSPACE
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
+    .unreq          TMPDUP
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This function contains a slow-but-accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG's original jfdctint.c; see jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS 13  /* fractional bits of the F_x_xxx constants: FIX(x) = round(x * 2^13) */
+#define PASS1_BITS 2   /* extra scaling applied in the first 1-D pass and removed in the second */
+
+#define DESCALE_P1 (CONST_BITS-PASS1_BITS)  /* narrowing shift used by the first 1-D pass */
+#define DESCALE_P2 (CONST_BITS+PASS1_BITS)  /* narrowing shift used by the second 1-D pass */
+
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:  /* 16 halfwords, loaded as {v0.8h, v1.8h}; order must match the XFIX_* lane aliases below */
+  .short F_0_298    /* v0.h[0]  XFIX_P_0_298 */
+  .short -F_0_390   /* v0.h[1]  XFIX_N_0_390 */
+  .short F_0_541    /* v0.h[2]  XFIX_P_0_541 */
+  .short F_0_765    /* v0.h[3]  XFIX_P_0_765 */
+  .short - F_0_899  /* v0.h[4]  XFIX_N_0_899 */
+  .short F_1_175    /* v0.h[5]  XFIX_P_1_175 */
+  .short F_1_501    /* v0.h[6]  XFIX_P_1_501 */
+  .short - F_1_847  /* v0.h[7]  XFIX_N_1_847 */
+  .short - F_1_961  /* v1.h[0]  XFIX_N_1_961 */
+  .short F_2_053    /* v1.h[1]  XFIX_P_2_053 */
+  .short - F_2_562  /* v1.h[2]  XFIX_N_2_562 */
+  .short F_3_072    /* v1.h[3]  XFIX_P_3_072 */
+  .short 0          /* padding */
+  .short 0          /* padding */
+  .short 0          /* padding */
+  .short 0          /* padding */
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+#define XFIX_P_0_298 v0.h[0]  /* _P_ = positive constant, _N_ = negated constant */
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+    /* In-place 2-D forward DCT of the 8x8 block of 16-bit values at DATA (x0): two (transpose + 1-D FDCT) passes. */
+    DATA            .req x0
+    TMP             .req x9
+
+    /* Load constants */
+    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    ld1             {v0.8h, v1.8h}, [TMP]
+
+    /* Save NEON registers (low 64 bits of v8-v15). SP stays below the save area so nothing can overwrite it. */
+    sub             sp, sp, #64
+    mov             TMP, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [TMP], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [TMP], 32
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub             DATA, DATA, #64  /* rewind DATA to the start of the block for the final store */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    /* 1-D FDCT (pass 1: results are left scaled up by 2^PASS1_BITS) */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P1
+    rshrn           v22.4h, v22.4s, #DESCALE_P1
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P1
+    rshrn           v21.4h, v29.4s, #DESCALE_P1
+    rshrn           v19.4h, v30.4s, #DESCALE_P1
+    rshrn           v17.4h, v31.4s, #DESCALE_P1
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+    /* 1-D FDCT (pass 2: the 2^PASS1_BITS scaling from pass 1 is removed here) */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P2
+    rshrn           v22.4h, v22.4s, #DESCALE_P2
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
+
+    /* Odd part */
+    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P2
+    rshrn           v21.4h, v29.4s, #DESCALE_P2
+    rshrn           v19.4h, v30.4s, #DESCALE_P2
+    rshrn           v17.4h, v31.4s, #DESCALE_P2
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    /* Restore NEON registers; the two post-increments pop the 64-byte save area, leaving SP at its entry value */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the forward DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
+ * function from jfdctfst.c
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#undef XFIX_0_541196100  /* drop any earlier definition before redefining the name as a lane alias */
+#define XFIX_0_382683433 v0.h[0]
+#define XFIX_0_541196100 v0.h[1]
+#define XFIX_0_707106781 v0.h[2]
+#define XFIX_1_306562965 v0.h[3]
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:  /* four sqdmulh multipliers (value / 32768), loaded into v0.4h in lane order */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 minus 1.0; the code adds the unscaled operand back separately */
+
+asm_function jsimd_fdct_ifast_neon
+    /* In-place forward DCT of the 8x8 block of 16-bit values at DATA (x0): (transpose + 1-D FDCT) executed twice. */
+    DATA            .req x0
+    TMP             .req x9
+
+    /* Load constants */
+    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    ld1             {v0.4h}, [TMP]
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    mov             TMP, #2          /* pass counter: the loop body below runs twice */
+    sub             DATA, DATA, #64  /* rewind DATA to the start of the block for the final store */
+1:
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
+    subs            TMP, TMP, #1     /* sets the flags tested by b.ne at the end of the pass */
+    /* 1-D FDCT */
+    add             v4.8h, v19.8h, v20.8h
+    sub             v20.8h, v19.8h, v20.8h
+    sub             v28.8h, v18.8h, v21.8h
+    add             v18.8h, v18.8h, v21.8h
+    sub             v29.8h, v17.8h, v22.8h
+    add             v17.8h, v17.8h, v22.8h
+    sub             v21.8h, v16.8h, v23.8h
+    add             v16.8h, v16.8h, v23.8h
+    sub             v6.8h, v17.8h, v18.8h
+    sub             v7.8h, v16.8h, v4.8h
+    add             v5.8h, v17.8h, v18.8h
+    add             v6.8h, v6.8h, v7.8h
+    add             v4.8h, v16.8h, v4.8h
+    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
+    add             v19.8h, v20.8h, v28.8h
+    add             v16.8h, v4.8h, v5.8h
+    sub             v20.8h, v4.8h, v5.8h
+    add             v5.8h, v28.8h, v29.8h
+    add             v29.8h, v29.8h, v21.8h
+    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
+    sub             v28.8h, v19.8h, v29.8h
+    add             v18.8h, v7.8h, v6.8h
+    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
+    sub             v22.8h, v7.8h, v6.8h
+    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
+    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965  /* fractional part only (constant is stored minus 1.0) */
+    add             v6.8h, v21.8h, v5.8h
+    sub             v5.8h, v21.8h, v5.8h
+    add             v29.8h, v29.8h, v28.8h
+    add             v19.8h, v19.8h, v28.8h
+    add             v29.8h, v29.8h, v7.8h            /* adds the fractional product onto the unscaled operand */
+    add             v21.8h, v5.8h, v19.8h
+    sub             v19.8h, v5.8h, v19.8h
+    add             v17.8h, v6.8h, v29.8h
+    sub             v23.8h, v6.8h, v29.8h
+
+    b.ne            1b               /* repeat for the second pass */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+#undef XFIX_0_382683433
+#undef XFIX_0_541196100
+#undef XFIX_0_707106781
+#undef XFIX_1_306562965
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
+ *                      DCTELEM *workspace);
+ * Per 16-bit value x: out = sign(x) * ((((|x| + correction) * reciprocal) >> 16) >> shift)
+ */
+asm_function jsimd_quantize_neon
+
+    COEF_BLOCK      .req x0
+    DIVISORS        .req x1
+    WORKSPACE       .req x2
+
+    RECIPROCAL      .req DIVISORS  /* alias: reciprocals are read from the start of the divisors table */
+    CORRECTION      .req x9
+    SHIFT           .req x10
+    LOOP_COUNT      .req x11
+
+    mov             LOOP_COUNT, #2                   /* 2 iterations x 32 values = 64 coefficients */
+    add             CORRECTION, DIVISORS, #(64 * 2)  /* corrections start 128 bytes into the table */
+    add             SHIFT, DIVISORS, #(64 * 6)       /* shift counts start 384 bytes into the table */
+1:
+    subs            LOOP_COUNT, LOOP_COUNT, #1       /* sets the flags tested by b.ne at the end of the loop */
+    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
+    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
+    abs             v20.8h, v0.8h
+    abs             v21.8h, v1.8h
+    abs             v22.8h, v2.8h
+    abs             v23.8h, v3.8h
+    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
+    add             v20.8h, v20.8h, v4.8h  /* add correction */
+    add             v21.8h, v21.8h, v5.8h
+    add             v22.8h, v22.8h, v6.8h
+    add             v23.8h, v23.8h, v7.8h
+    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
+    umull2          v16.4s, v20.8h, v28.8h
+    umull           v5.4s, v21.4h, v29.4h
+    umull2          v17.4s, v21.8h, v29.8h
+    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
+    umull2          v18.4s, v22.8h, v30.8h
+    umull           v7.4s, v23.4h, v31.4h
+    umull2          v19.4s, v23.8h, v31.8h
+    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
+    shrn            v4.4h, v4.4s, #16      /* keep the high 16 bits of each 32-bit product */
+    shrn            v5.4h, v5.4s, #16
+    shrn            v6.4h, v6.4s, #16
+    shrn            v7.4h, v7.4s, #16
+    shrn2           v4.8h, v16.4s, #16
+    shrn2           v5.8h, v17.4s, #16
+    shrn2           v6.8h, v18.4s, #16
+    shrn2           v7.8h, v19.4s, #16
+    neg             v24.8h, v24.8h         /* negate the counts so that ushl below shifts right */
+    neg             v25.8h, v25.8h
+    neg             v26.8h, v26.8h
+    neg             v27.8h, v27.8h
+    sshr            v0.8h, v0.8h, #15  /* extract sign */
+    sshr            v1.8h, v1.8h, #15
+    sshr            v2.8h, v2.8h, #15
+    sshr            v3.8h, v3.8h, #15
+    ushl            v4.8h, v4.8h, v24.8h  /* shift */
+    ushl            v5.8h, v5.8h, v25.8h
+    ushl            v6.8h, v6.8h, v26.8h
+    ushl            v7.8h, v7.8h, v27.8h
+
+    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
+    eor             v5.16b, v5.16b, v1.16b
+    eor             v6.16b, v6.16b, v2.16b
+    eor             v7.16b, v7.16b, v3.16b
+    sub             v4.8h, v4.8h, v0.8h     /* (q ^ s) - s negates q where the sign mask s is all ones */
+    sub             v5.8h, v5.8h, v1.8h
+    sub             v6.8h, v6.8h, v2.8h
+    sub             v7.8h, v7.8h, v3.8h
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
+
+    b.ne            1b
+
+    br              x30  /* return */
+
+    .unreq          COEF_BLOCK
+    .unreq          DIVISORS
+    .unreq          WORKSPACE
+    .unreq          RECIPROCAL
+    .unreq          CORRECTION
+    .unreq          SHIFT
+    .unreq          LOOP_COUNT
+
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ *
+ * GLOBAL(void)
+ * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor,
+ *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
+ *                             JSAMPARRAY output_data);
+ */
+
+.balign 16
+Ljsimd_h2_downsample_neon_consts:  /* 16 tbl index vectors of 16 bytes; row "diff N" repeats index 15-N in its last N lanes */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+
+asm_function jsimd_h2v1_downsample_neon
+    IMAGE_WIDTH     .req x0
+    MAX_V_SAMP      .req x1
+    V_SAMP          .req x2
+    BLOCK_WIDTH     .req x3
+    INPUT_DATA      .req x4
+    OUTPUT_DATA     .req x5
+    OUTPTR          .req x9
+    INPTR           .req x10
+    TMP1            .req x11
+    TMP2            .req x12
+    TMP3            .req x13
+    TMPDUP          .req w15
+
+    /* IMAGE_WIDTH, V_SAMP and BLOCK_WIDTH are JDIMENSION (unsigned int)
+     * arguments, so the ABI does not guarantee that the upper 32 bits of
+     * x0/x2/x3 are zero on entry.  They are used below as 64-bit values (table
+     * offset and loop counters), so zero-extend them first. */
+    uxtw            x0, w0
+    uxtw            x2, w2
+    uxtw            x3, w3
+
+    mov             TMPDUP, #0x10000                 /* rounding bias: 16-bit lanes alternate 0, 1 */
+    lsl             TMP2, BLOCK_WIDTH, #4            /* BLOCK_WIDTH * 16 ... */
+    sub             TMP2, TMP2, IMAGE_WIDTH          /* ... minus IMAGE_WIDTH selects the "diff" table row */
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    add             TMP3, TMP3, TMP2, lsl #4         /* each table row is 16 bytes */
+    dup             v16.4s, TMPDUP
+    ld1             {v18.16b}, [TMP3]                /* tbl indices used on the last 16 input bytes of a row */
+
+1:  /* row loop */
+    ldr             INPTR, [INPUT_DATA], #8
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    subs            TMP1, BLOCK_WIDTH, #1            /* all but the last group of columns */
+    b.eq            3f
+2:  /* columns */
+    ld1             {v0.16b}, [INPTR], #16
+    mov             v4.16b, v16.16b                  /* start each accumulator from the bias */
+    subs            TMP1, TMP1, #1
+    uadalp          v4.8h, v0.16b                    /* bias + sum of each horizontal byte pair */
+    shrn            v6.8b, v4.8h, #1                 /* halve and narrow to 8 output bytes */
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            2b
+3:  /* last columns */
+    ld1             {v0.16b}, [INPTR]
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1               /* row counter; flags are tested by b.ne 1b below */
+    /* expand right */
+    tbl             v2.16b, {v0.16b}, v18.16b
+    uadalp          v4.8h, v2.16b
+    shrn            v6.8b, v4.8h, #1
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
+
+    br              x30
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP
+
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ *
+ * GLOBAL(void)
+ * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ */
+
+.balign 16
+asm_function jsimd_h2v2_downsample_neon
+    IMAGE_WIDTH     .req x0
+    MAX_V_SAMP      .req x1
+    V_SAMP          .req x2
+    BLOCK_WIDTH     .req x3
+    INPUT_DATA      .req x4
+    OUTPUT_DATA     .req x5
+    OUTPTR          .req x9
+    INPTR0          .req x10
+    INPTR1          .req x14
+    TMP1            .req x11
+    TMP2            .req x12
+    TMP3            .req x13
+    TMPDUP          .req w15
+
+    /* IMAGE_WIDTH, V_SAMP and BLOCK_WIDTH are JDIMENSION (unsigned int)
+     * arguments, so the ABI does not guarantee that the upper 32 bits of
+     * x0/x2/x3 are zero on entry.  They are used below as 64-bit values (table
+     * offset and loop counters), so zero-extend them first. */
+    uxtw            x0, w0
+    uxtw            x2, w2
+    uxtw            x3, w3
+
+    mov             TMPDUP, #1
+    lsl             TMP2, BLOCK_WIDTH, #4            /* BLOCK_WIDTH * 16 ... */
+    lsl             TMPDUP, TMPDUP, #17
+    sub             TMP2, TMP2, IMAGE_WIDTH          /* ... minus IMAGE_WIDTH selects the "diff" table row */
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    orr             TMPDUP, TMPDUP, #1               /* 0x00020001: 16-bit bias lanes alternate 1, 2 */
+    add             TMP3, TMP3, TMP2, lsl #4         /* each table row is 16 bytes */
+    dup             v16.4s, TMPDUP
+    ld1             {v18.16b}, [TMP3]                /* tbl indices used on the last 16 input bytes of a row */
+
+1:  /* row loop */
+    ldr             INPTR0, [INPUT_DATA], #8         /* two input rows are consumed per output row */
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    ldr             INPTR1, [INPUT_DATA], #8
+    subs            TMP1, BLOCK_WIDTH, #1            /* all but the last group of columns */
+    b.eq            3f
+2:  /* columns */
+    ld1             {v0.16b}, [INPTR0], #16
+    ld1             {v1.16b}, [INPTR1], #16
+    mov             v4.16b, v16.16b                  /* start each accumulator from the bias */
+    subs            TMP1, TMP1, #1
+    uadalp          v4.8h, v0.16b                    /* add byte pairs from the first row */
+    uadalp          v4.8h, v1.16b                    /* add byte pairs from the second row */
+    shrn            v6.8b, v4.8h, #2                 /* divide the 4-sample sum by 4 and narrow */
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            2b
+3:  /* last columns */
+    ld1             {v0.16b}, [INPTR0], #16
+    ld1             {v1.16b}, [INPTR1], #16
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1               /* row counter; flags are tested by b.ne 1b below */
+    /* expand right */
+    tbl             v2.16b, {v0.16b}, v18.16b
+    tbl             v3.16b, {v1.16b}, v18.16b
+    uadalp          v4.8h, v2.16b
+    uadalp          v4.8h, v3.16b
+    shrn            v6.8b, v4.8h, #2
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
+
+    br              x30
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR0
+    .unreq          INPTR1
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
+ *                              JCOEFPTR block, int last_dc_val,
+ *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+    BUFFER          .req x1   /* output byte pointer; emit_byte pre-increments it before each store */
+    PUT_BUFFER      .req x6   /* bit accumulator; new codes are shifted in at the low end */
+    PUT_BITS        .req x7   /* number of bits currently held in PUT_BUFFER */
+    PUT_BITSw       .req w7   /* 32-bit view of PUT_BITS */
+
+.macro emit_byte  /* writes one byte from PUT_BUFFER to the output; clobbers x19 and the flags */
+    sub             PUT_BITS, PUT_BITS, #0x8     /* 8 fewer bits remain pending */
+    lsr             x19, PUT_BUFFER, PUT_BITS    /* bring the byte to emit down to bits 7..0 */
+    uxtb            w19, w19                     /* discard everything above that byte */
+    strb            w19, [BUFFER, #1]!           /* advance BUFFER by one, then store the byte */
+    cmp             w19, #0xff
+    b.ne            14f
+    strb            wzr, [BUFFER, #1]!           /* a 0xFF byte is followed by an extra 0x00 byte */
+14:
+.endm
+.macro put_bits CODE, SIZE  /* appends the SIZE-bit value CODE to the low end of PUT_BUFFER */
+    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
+    add             PUT_BITS, PUT_BITS, \SIZE
+    orr             PUT_BUFFER, PUT_BUFFER, \CODE
+.endm
+.macro checkbuf31  /* if at least 32 bits are pending, write out 4 bytes */
+    cmp             PUT_BITS, #0x20
+    b.lt            31f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+31:
+.endm
+.macro checkbuf47  /* if at least 48 bits are pending, write out 6 bytes */
+    cmp             PUT_BITS, #0x30
+    b.lt            47f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+/* fast_tbl=1: zigzag via TBL/TBX lookups; fast_tbl=0: via per-lane loads. */
+.balign 16
+.if \fast_tbl == 1
+Ljsimd_huff_encode_one_block_neon_consts:
+.else
+Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
+.endif
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+.if \fast_tbl == 1
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+.endif
+/* Frame (272 bytes): [sp] x19/x20, [sp+0x10] t1[64], [sp+0x90] t2[64]. */
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+    sub             sp, sp, 272
+    sub             BUFFER, BUFFER, #0x1    /* BUFFER = buffer - 1 */
+    /* Save ARM registers inside the frame (nothing may live below sp) */
+    stp             x19, x20, [sp]
+.if \fast_tbl == 1
+    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
+.else
+    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
+.endif
+    ldr             PUT_BUFFER, [x0, #0x10]
+    ldr             PUT_BITSw, [x0, #0x18]
+    ldrsh           w12, [x2]               /* load DC coeff in w12 */
+    /* prepare data */
+.if \fast_tbl == 1
+    ld1             {v23.16b}, [x15], #16
+    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
+    /* ZigZag 8x8 */
+    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+    ins             v0.h[0], w12
+    tbx             v1.16b, {v28.16b}, v16.16b
+    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
+    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
+    tbx             v6.16b, {v31.16b}, v19.16b
+.else
+      add             x13, x2, #0x22
+      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
+    ld1             {v23.16b}, [x15]
+      add             x14, x2, #0x18
+      add             x3, x2, #0x36
+    ins             v0.h[0], w12
+      add             x9, x2, #0x2
+    ld1             {v1.h}[0], [x13]
+      add             x15, x2, #0x30
+    ld1             {v2.h}[0], [x14]
+      add             x19, x2, #0x26
+    ld1             {v3.h}[0], [x3]
+      add             x20, x2, #0x28
+    ld1             {v0.h}[1], [x9]
+      add             x12, x2, #0x10
+    ld1             {v1.h}[1], [x15]
+      add             x13, x2, #0x40
+    ld1             {v2.h}[1], [x19]
+      add             x14, x2, #0x34
+    ld1             {v3.h}[1], [x20]
+      add             x3, x2, #0x1a
+    ld1             {v0.h}[2], [x12]
+      add             x9, x2, #0x20
+    ld1             {v1.h}[2], [x13]
+      add             x15, x2, #0x32
+    ld1             {v2.h}[2], [x14]
+      add             x19, x2, #0x42
+    ld1             {v3.h}[2], [x3]
+      add             x20, x2, #0xc
+    ld1             {v0.h}[3], [x9]
+      add             x12, x2, #0x12
+    ld1             {v1.h}[3], [x15]
+      add             x13, x2, #0x24
+    ld1             {v2.h}[3], [x19]
+      add             x14, x2, #0x50
+    ld1             {v3.h}[3], [x20]
+      add             x3, x2, #0xe
+    ld1             {v0.h}[4], [x12]
+      add             x9, x2, #0x4
+    ld1             {v1.h}[4], [x13]
+      add             x15, x2, #0x16
+    ld1             {v2.h}[4], [x14]
+      add             x19, x2, #0x60
+    ld1             {v3.h}[4], [x3]
+      add             x20, x2, #0x1c
+    ld1             {v0.h}[5], [x9]
+      add             x12, x2, #0x6
+    ld1             {v1.h}[5], [x15]
+      add             x13, x2, #0x8
+    ld1             {v2.h}[5], [x19]
+      add             x14, x2, #0x52
+    ld1             {v3.h}[5], [x20]
+      add             x3, x2, #0x2a
+    ld1             {v0.h}[6], [x12]
+      add             x9, x2, #0x14
+    ld1             {v1.h}[6], [x13]
+      add             x15, x2, #0xa
+    ld1             {v2.h}[6], [x14]
+      add             x19, x2, #0x44
+    ld1             {v3.h}[6], [x3]
+      add             x20, x2, #0x38
+    ld1             {v0.h}[7], [x9]
+      add             x12, x2, #0x46
+    ld1             {v1.h}[7], [x15]
+      add             x13, x2, #0x3a
+    ld1             {v2.h}[7], [x19]
+      add             x14, x2, #0x74
+    ld1             {v3.h}[7], [x20]
+      add             x3, x2, #0x6a
+    ld1             {v4.h}[0], [x12]
+      add             x9, x2, #0x54
+    ld1             {v5.h}[0], [x13]
+      add             x15, x2, #0x2c
+    ld1             {v6.h}[0], [x14]
+      add             x19, x2, #0x76
+    ld1             {v7.h}[0], [x3]
+      add             x20, x2, #0x78
+    ld1             {v4.h}[1], [x9]
+      add             x12, x2, #0x62
+    ld1             {v5.h}[1], [x15]
+      add             x13, x2, #0x1e
+    ld1             {v6.h}[1], [x19]
+      add             x14, x2, #0x68
+    ld1             {v7.h}[1], [x20]
+      add             x3, x2, #0x7a
+    ld1             {v4.h}[2], [x12]
+      add             x9, x2, #0x70
+    ld1             {v5.h}[2], [x13]
+      add             x15, x2, #0x2e
+    ld1             {v6.h}[2], [x14]
+      add             x19, x2, #0x5a
+    ld1             {v7.h}[2], [x3]
+      add             x20, x2, #0x6c
+    ld1             {v4.h}[3], [x9]
+      add             x12, x2, #0x72
+    ld1             {v5.h}[3], [x15]
+      add             x13, x2, #0x3c
+    ld1             {v6.h}[3], [x19]
+      add             x14, x2, #0x4c
+    ld1             {v7.h}[3], [x20]
+      add             x3, x2, #0x5e
+    ld1             {v4.h}[4], [x12]
+      add             x9, x2, #0x64
+    ld1             {v5.h}[4], [x13]
+      add             x15, x2, #0x4a
+    ld1             {v6.h}[4], [x14]
+      add             x19, x2, #0x3e
+    ld1             {v7.h}[4], [x3]
+      add             x20, x2, #0x6e
+    ld1             {v4.h}[5], [x9]
+      add             x12, x2, #0x56
+    ld1             {v5.h}[5], [x15]
+      add             x13, x2, #0x58
+    ld1             {v6.h}[5], [x19]
+      add             x14, x2, #0x4e
+    ld1             {v7.h}[5], [x20]
+      add             x3, x2, #0x7c
+    ld1             {v4.h}[6], [x12]
+      add             x9, x2, #0x48
+    ld1             {v5.h}[6], [x13]
+      add             x15, x2, #0x66
+    ld1             {v6.h}[6], [x14]
+      add             x19, x2, #0x5c
+    ld1             {v7.h}[6], [x3]
+      add             x20, x2, #0x7e
+    ld1             {v4.h}[7], [x9]
+    ld1             {v5.h}[7], [x15]
+    ld1             {v6.h}[7], [x19]
+    ld1             {v7.h}[7], [x20]
+.endif
+    cmlt            v24.8h, v0.8h, #0
+    cmlt            v25.8h, v1.8h, #0
+    cmlt            v26.8h, v2.8h, #0
+    cmlt            v27.8h, v3.8h, #0
+    cmlt            v28.8h, v4.8h, #0
+    cmlt            v29.8h, v5.8h, #0
+    cmlt            v30.8h, v6.8h, #0
+    cmlt            v31.8h, v7.8h, #0
+    abs             v0.8h, v0.8h
+    abs             v1.8h, v1.8h
+    abs             v2.8h, v2.8h
+    abs             v3.8h, v3.8h
+    abs             v4.8h, v4.8h
+    abs             v5.8h, v5.8h
+    abs             v6.8h, v6.8h
+    abs             v7.8h, v7.8h
+    eor             v24.16b, v24.16b, v0.16b
+    eor             v25.16b, v25.16b, v1.16b
+    eor             v26.16b, v26.16b, v2.16b
+    eor             v27.16b, v27.16b, v3.16b
+    eor             v28.16b, v28.16b, v4.16b
+    eor             v29.16b, v29.16b, v5.16b
+    eor             v30.16b, v30.16b, v6.16b
+    eor             v31.16b, v31.16b, v7.16b
+    cmeq            v16.8h, v0.8h, #0
+    cmeq            v17.8h, v1.8h, #0
+    cmeq            v18.8h, v2.8h, #0
+    cmeq            v19.8h, v3.8h, #0
+    cmeq            v20.8h, v4.8h, #0
+    cmeq            v21.8h, v5.8h, #0
+    cmeq            v22.8h, v6.8h, #0
+    xtn             v16.8b, v16.8h
+    xtn             v18.8b, v18.8h
+    xtn             v20.8b, v20.8h
+    xtn             v22.8b, v22.8h
+      umov            w14, v0.h[0]
+    xtn2            v16.16b, v17.8h
+      umov            w13, v24.h[0]
+    xtn2            v18.16b, v19.8h
+      clz             w14, w14
+    xtn2            v20.16b, v21.8h
+      lsl             w13, w13, w14
+    cmeq            v17.8h, v7.8h, #0
+      sub             w12, w14, #32
+    xtn2            v22.16b, v17.8h
+      lsr             w13, w13, w14
+    and             v16.16b, v16.16b, v23.16b
+      neg             w12, w12
+    and             v18.16b, v18.16b, v23.16b
+      add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
+    and             v20.16b, v20.16b, v23.16b
+      add             x15, sp, #0x90           /* x15 = t2 */
+    and             v22.16b, v22.16b, v23.16b
+      ldr             w10, [x4, x12, lsl #2]
+    addp            v16.16b, v16.16b, v18.16b
+      ldrb            w11, [x3, x12]
+    addp            v20.16b, v20.16b, v22.16b
+      checkbuf47
+    addp            v16.16b, v16.16b, v20.16b
+      put_bits        x10, x11
+    addp            v16.16b, v16.16b, v18.16b
+      checkbuf47
+    umov            x9,v16.D[0]
+      put_bits        x13, x12
+    cnt             v17.8b, v16.8b
+      mvn             x9, x9
+    addv            B18, v17.8b
+      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
+    umov            w12, v18.b[0]
+      lsr             x9, x9, #0x1     /* drop DC coeff bit */
+    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
+    rbit            x9, x9             /* x9 = index0 */
+    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
+    cmp             w12, #(64-8)       /* w12 = number of zero coeffs */
+    add             x11, sp, #0x10     /* x11 = t1 (just above saved regs) */
+    b.lt            4f                 /* more than 8 non-zero: NEON nbits */
+    cbz             x9, 6f             /* no non-zero AC coeff at all */
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9             /* x2 = run of zero coeffs */
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w20, [x15, #-126]  /* matching t1 entry (t2 - 128 + 2) */
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10      /* run of 16+: emit symbol 0xf0 */
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    clz             w20, w20
+    ldrh            w3, [x15, #2]!
+    sub             w11, w20, #32
+    lsl             w3, w3, w20
+    neg             w11, w11
+    lsr             w3, w3, w20
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+    b               6f
+4:
+    movi            v21.8h, #0x0010
+    clz             v0.8h, v0.8h
+    clz             v1.8h, v1.8h
+    clz             v2.8h, v2.8h
+    clz             v3.8h, v3.8h
+    clz             v4.8h, v4.8h
+    clz             v5.8h, v5.8h
+    clz             v6.8h, v6.8h
+    clz             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    neg             v0.8h, v0.8h
+    neg             v1.8h, v1.8h
+    neg             v2.8h, v2.8h
+    neg             v3.8h, v3.8h
+    neg             v4.8h, v4.8h
+    neg             v5.8h, v5.8h
+    neg             v6.8h, v6.8h
+    neg             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    add             v0.8h, v21.8h, v0.8h
+    add             v1.8h, v21.8h, v1.8h
+    add             v2.8h, v21.8h, v2.8h
+    add             v3.8h, v21.8h, v3.8h
+    add             v4.8h, v21.8h, v4.8h
+    add             v5.8h, v21.8h, v5.8h
+    add             v6.8h, v21.8h, v6.8h
+    add             v7.8h, v21.8h, v7.8h
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9             /* x2 = run of zero coeffs */
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w11, [x15, #-126]  /* matching t1 entry (16 - clz) */
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10      /* run of 16+: emit symbol 0xf0 */
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    ldrh            w3, [x15, #2]!
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+6:
+    add             x13, sp, #0x10e    /* x13 = &t2[63] */
+    cmp             x15, x13
+    b.hs            1f                 /* last coeff was coded: skip symbol 0 */
+    ldr             w12, [x5]          /* actbl->ehufco[0] */
+    ldrb            w14, [x4]          /* actbl->ehufsi[0] */
+    checkbuf47
+    put_bits        x12, x14
+1:
+    /* Write back the bit buffer state, restore x19/x20, free the frame */
+    str             PUT_BUFFER, [x0, #0x10]
+    str             PUT_BITSw, [x0, #0x18]
+    ldp             x19, x20, [sp]
+    add             x0, BUFFER, #0x1
+    add             sp, sp, 272
+    br              x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1  /* jsimd_huff_encode_one_block_neon */
+generate_jsimd_huff_encode_one_block 0  /* ..._neon_slowtbl variant */
+
+    .unreq          BUFFER
+    .unreq          PUT_BUFFER
+    .unreq          PUT_BITS
+    .unreq          PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 4d9685b..568768f 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -6,6 +6,8 @@
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
  * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016 Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -25,7 +27,7 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
@@ -33,6 +35,7 @@
 .arch armv7a
 .object_arch armv4
 .arm
+.syntax unified
 
 
 #define RESPECT_STRICT_ALIGNMENT 1
@@ -57,10 +60,10 @@
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
 .macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16 \x0, \x1
-    vtrn.16 \x2, \x3
-    vtrn.32 \x0, \x2
-    vtrn.32 \x1, \x3
+    vtrn.16         \x0, \x1
+    vtrn.16         \x2, \x3
+    vtrn.32         \x0, \x2
+    vtrn.32         \x1, \x3
 .endm
 
 
@@ -72,22 +75,22 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
 
 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
@@ -105,8 +108,8 @@
 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
 {                                                                             \
     DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
-    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
+    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
+    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
                                                                               \
     /* 1-D iDCT input data */                                                 \
     row0 = xrow0;                                                             \
@@ -127,7 +130,7 @@
     q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
          MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
     q4 = q6;                                                                  \
-    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
+    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
     q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
           MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
     /* now we can use q1 (reloadable constants have been used up) */          \
@@ -154,7 +157,7 @@
     /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
     tmp11_minus_tmp2 = q1;                                                    \
                                                                               \
-    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
+    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
     q2 = q1 + q6;                                                             \
     q1 = q1 - q6;                                                             \
                                                                               \
@@ -169,34 +172,34 @@
     tmp13 = q1;                                                               \
 }
 
-#define XFIX_0_899976223                    d0[0]
-#define XFIX_0_541196100                    d0[1]
-#define XFIX_2_562915447                    d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
-#define XFIX_1_175875602                    d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+#define XFIX_0_899976223                   d0[0]
+#define XFIX_0_541196100                   d0[1]
+#define XFIX_2_562915447                   d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
+#define XFIX_1_175875602                   d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
 
 .balign 16
 jsimd_idct_islow_neon_consts:
-    .short FIX_0_899976223                    /* d0[0] */
-    .short FIX_0_541196100                    /* d0[1] */
-    .short FIX_2_562915447                    /* d0[2] */
-    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-    .short FIX_1_175875602                    /* d1[3] */
-    /* reloadable constants */
-    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
 
 asm_function jsimd_idct_islow_neon
 
@@ -255,140 +258,141 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
     add             ip, ip, #16
     vmul.s16        q15, q15, q3
-    vpush           {d8-d15} /* save NEON registers */
+    vpush           {d8-d15}                      /* save NEON registers */
     /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4,    ROW7L, ROW3L
-    vadd.s16        d5,    ROW5L, ROW1L
-    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d5,    XFIX_1_175875602
-    vmull.s16       q7,    d4,    XFIX_1_175875602
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
       /* Check for the zero coefficients in the right 4x8 half */
       push            {r4, r5}
-    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW4L
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0,    r4,    r5
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-      orr             r0,    r0,    r4
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q2
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-      orr             r0,    r0,    r4
-    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0,    r0,    r4
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-      orr             r0,    r0,    r5
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW6L, q1,    #11
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q5
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW4L
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW2L, q1,    #11
-      orr             r0,    r0,    r5
-    vrshrn.s32      ROW5L, q3,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0,    r0,    r4
-    vadd.s32        q2,    q5,    q6
-      orrs            r0,    r0,    r5
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-      orr             r0,    r4,    r5
-    vsub.s32        q3,    q1,    q4
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
       pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2,    #11
-    vrshrn.s32      ROW3L, q5,    #11
-    vrshrn.s32      ROW0L, q6,    #11
-    vrshrn.s32      ROW4L, q3,    #11
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
 
-      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
 
     /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vadd.s16        d10,   ROW7R, ROW3R
-    vadd.s16        d8,    ROW5R, ROW1R
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
       /* Transpose left 4x8 half */
       vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d8,    XFIX_1_175875602
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
       vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7,    d10,   XFIX_1_175875602
-    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
       vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3,    ROW0R, ROW4R
-    vmull.s16       q2,    ROW2R, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
       vtrn.16         ROW4L, ROW5L
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
       vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
       vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
       vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1,    #11
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
       vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vrshrn.s32      ROW6R, q1,    #11
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1,    #11
-    vrshrn.s32      ROW5R, q3,    #11
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vrshrn.s32      ROW7R, q2,    #11
-    vrshrn.s32      ROW3R, q5,    #11
-    vrshrn.s32      ROW0R, q6,    #11
-    vrshrn.s32      ROW4R, q3,    #11
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
     /* Transpose right 4x8 half */
     vtrn.16         ROW6R, ROW7R
     vtrn.16         ROW2R, ROW3R
@@ -400,122 +404,122 @@
     vtrn.32         ROW5R, ROW7R
 
 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5R, XFIX_1_175875602
-    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmull.s16       q7,    ROW7R, XFIX_1_175875602
-    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
 
 2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16,   q8,    #2
-    vqrshrn.s16     d17,   q9,    #2
-    vqrshrn.s16     d18,   q10,   #2
-    vqrshrn.s16     d19,   q11,   #2
-    vpop            {d8-d15} /* restore NEON registers */
-    vqrshrn.s16     d20,   q12,   #2
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8-d15}                      /* restore NEON registers */
+    vqrshrn.s16     d20, q12, #2
       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8,    q9
-    vqrshrn.s16     d21,   q13,   #2
-    vqrshrn.s16     d22,   q14,   #2
-      vmov.u8         q0,    #(CENTERJSAMPLE)
-    vqrshrn.s16     d23,   q15,   #2
-      vtrn.8          d16,   d17
-      vtrn.8          d18,   d19
-      vadd.u8         q8,    q8,    q0
-      vadd.u8         q9,    q9,    q0
-      vtrn.16         q10,   q11
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
         /* Store results to the output buffer */
         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
         add             TMP1, TMP1, OUTPUT_COL
@@ -527,7 +531,7 @@
         add             TMP1, TMP1, OUTPUT_COL
         add             TMP2, TMP2, OUTPUT_COL
         vst1.8          {d18}, [TMP1]
-      vadd.u8         q10,   q10,   q0
+      vadd.u8         q10, q10, q0
         vst1.8          {d19}, [TMP2]
         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
         add             TMP1, TMP1, OUTPUT_COL
@@ -536,7 +540,7 @@
         add             TMP4, TMP4, OUTPUT_COL
       vtrn.8          d22, d23
         vst1.8          {d20}, [TMP1]
-      vadd.u8         q11,   q11,   q0
+      vadd.u8         q11, q11, q0
         vst1.8          {d21}, [TMP2]
         vst1.8          {d22}, [TMP3]
         vst1.8          {d23}, [TMP4]
@@ -549,14 +553,15 @@
     vtrn.16         ROW2L, ROW3L
     vtrn.16         ROW0L, ROW1L
     vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
     vtrn.32         ROW1L, ROW3L
     vtrn.32         ROW4L, ROW6L
     vtrn.32         ROW0L, ROW2L
     vtrn.32         ROW5L, ROW7L
 
     cmp             r0, #0
-    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
 
     /* Only row 0 is non-zero for the right 4x8 half  */
     vdup.s16        ROW1R, ROW0R[1]
@@ -567,83 +572,83 @@
     vdup.s16        ROW6R, ROW0R[2]
     vdup.s16        ROW7R, ROW0R[3]
     vdup.s16        ROW0R, ROW0R[0]
-    b               1b /* Go to 'normal' second pass */
+    b               1b  /* Go to 'normal' second pass */
 
 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vshll.s16       q3,    ROW0L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW0L, #13
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW7L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100
-    vshll.s16       q3,    ROW4L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW4L, #13
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
-    b               2b /* Go to epilogue */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -697,10 +702,10 @@
 
 .balign 16
 jsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: 277/256 - 1, Q15 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: 362/256 - 1, Q15 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: 473/256 - 1, Q15 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: 669/256 - 2, Q15 */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -730,9 +735,9 @@
     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8,  q8,  q0
+    vmul.s16        q8, q8, q0
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9,  q9,  q1
+    vmul.s16        q9, q9, q1
     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
     vmul.s16        q10, q10, q2
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
@@ -742,124 +747,124 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64] /* load constants */
+    vld1.16         {d0}, [ip, :64]  /* load constants */
     vmul.s16        q15, q15, q3
-    vpush           {d8-d13}        /* save NEON registers */
+    vpush           {d8-d13}         /* save NEON registers */
     /* 1-D IDCT, pass 1 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
     vadd.s16        q14, q10, q14
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
     vadd.s16        q10, q10, q2
       /* Transpose */
-      vtrn.16         q8,  q9
+      vtrn.16         q8, q9
     vsub.s16        q11, q12, q1
       vtrn.16         q14, q15
     vadd.s16        q12, q12, q1
       vtrn.16         q10, q11
       vtrn.16         q12, q13
-      vtrn.32         q9,  q11
+      vtrn.32         q9, q11
       vtrn.32         q12, q14
-      vtrn.32         q8,  q10
+      vtrn.32         q8, q10
       vtrn.32         q13, q15
       vswp            d28, d21
       vswp            d26, d19
     /* 1-D IDCT, pass 2 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
       vswp            d30, d23
     vadd.s16        q14, q10, q14
       vswp            d24, d17
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
-    vpop            {d8-d13}        /* restore NEON registers */
+    vpop            {d8-d13}      /* restore NEON registers */
     vadd.s16        q10, q10, q2
     vsub.s16        q11, q12, q1
     vadd.s16        q12, q12, q1
     /* Descale to 8-bit and range limit */
-    vmov.u8         q0,  #0x80
-    vqshrn.s16      d16, q8,  #5
-    vqshrn.s16      d17, q9,  #5
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
     vqshrn.s16      d18, q10, #5
     vqshrn.s16      d19, q11, #5
     vqshrn.s16      d20, q12, #5
     vqshrn.s16      d21, q13, #5
     vqshrn.s16      d22, q14, #5
     vqshrn.s16      d23, q15, #5
-    vadd.u8         q8,  q8,  q0
-    vadd.u8         q9,  q9,  q0
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
     vadd.u8         q10, q10, q0
     vadd.u8         q11, q11, q0
     /* Transpose the final 8-bit samples */
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q10, q11
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
     vtrn.8          d16, d17
     vtrn.8          d18, d19
       /* Store results to the output buffer */
@@ -918,81 +923,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
 jsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* d0[0] */
-    .short     -FIX_0_765366865    /* d0[1] */
-    .short     -FIX_0_211164243    /* d0[2] */
-    .short     FIX_1_451774981     /* d0[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* d2[0] */
-    .short     FIX_2_562915447     /* d2[1] */
-    .short     1 << (CONST_BITS+1) /* d2[2] */
-    .short     0                   /* d2[3] */
+  .short FIX_1_847759065      /* d0[0] */
+  .short -FIX_0_765366865     /* d0[1] */
+  .short -FIX_0_211164243     /* d0[2] */
+  .short FIX_1_451774981      /* d0[3] */
+  .short -FIX_2_172734803     /* d1[0] */
+  .short FIX_1_061594337      /* d1[1] */
+  .short -FIX_0_509795579     /* d1[2] */
+  .short -FIX_0_601344887     /* d1[3] */
+  .short FIX_0_899976223      /* d2[0] */
+  .short FIX_2_562915447      /* d2[1] */
+  .short 1 << (CONST_BITS+1)  /* d2[2] */
+  .short 0                    /* d2[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4,  d2[2]
-    vmlal.s16       q14, \x8,  d0[0]
+    vmull.s16       q14, \x4, d2[2]
+    vmlal.s16       q14, \x8, d0[0]
     vmlal.s16       q14, \x14, d0[1]
 
     vmull.s16       q13, \x16, d1[2]
     vmlal.s16       q13, \x12, d1[3]
     vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6,  d2[1]
+    vmlal.s16       q13, \x6, d2[1]
 
-    vmull.s16       q15, \x4,  d2[2]
-    vmlsl.s16       q15, \x8,  d0[0]
+    vmull.s16       q15, \x4, d2[2]
+    vmlsl.s16       q15, \x8, d0[0]
     vmlsl.s16       q15, \x14, d0[1]
 
     vmull.s16       q12, \x16, d0[2]
     vmlal.s16       q12, \x12, d0[3]
     vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6,  d1[1]
+    vmlal.s16       q12, \x6, d1[1]
 
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q14,  q14, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
     vmovn.s32       \y26, q10
     vmovn.s32       \y29, q14
-.else
+  .else
     vrshrn.s32      \y26, q10, #\shift
     vrshrn.s32      \y29, q14, #\shift
-.endif
+  .endif
 
     vadd.s32        q10, q15, q12
     vsub.s32        q15, q15, q12
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q15,  q15, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q15, q15, #\shift
     vmovn.s32       \y27, q10
     vmovn.s32       \y28, q15
-.else
+  .else
     vrshrn.s32      \y27, q10, #\shift
     vrshrn.s32      \y28, q15, #\shift
-.endif
-
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1128,31 +1132,30 @@
 
 .balign 8
 jsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* d0[0] */
-    .short     FIX_0_850430095     /* d0[1] */
-    .short     -FIX_1_272758580    /* d0[2] */
-    .short     FIX_3_624509785     /* d0[3] */
+  .short -FIX_0_720959822  /* d0[0] */
+  .short FIX_0_850430095   /* d0[1] */
+  .short -FIX_1_272758580  /* d0[2] */
+  .short FIX_3_624509785   /* d0[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16  q14,  \x4,  #15
-    vmull.s16  q13,  \x6,  d0[3]
-    vmlal.s16  q13,  \x10, d0[2]
-    vmlal.s16  q13,  \x12, d0[1]
-    vmlal.s16  q13,  \x16, d0[0]
+    vshll.s16       q14, \x4, #15
+    vmull.s16       q13, \x6, d0[3]
+    vmlal.s16       q13, \x10, d0[2]
+    vmlal.s16       q13, \x12, d0[1]
+    vmlal.s16       q13, \x16, d0[0]
 
-    vadd.s32   q10,  q14,  q13
-    vsub.s32   q14,  q14,  q13
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32  q10,  q10,  #\shift
-    vrshr.s32  q14,  q14,  #\shift
-    vmovn.s32  \y26, q10
-    vmovn.s32  \y27, q14
-.else
-    vrshrn.s32 \y26, q10,  #\shift
-    vrshrn.s32 \y27, q14,  #\shift
-.endif
-
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
+    vmovn.s32       \y26, q10
+    vmovn.s32       \y27, q14
+  .else
+    vrshrn.s32      \y26, q10, #\shift
+    vrshrn.s32      \y27, q14, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1206,30 +1209,30 @@
     /* Pass 1 */
 #if 0
     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8,  d10
+    transpose_4x4   d4, d6, d8, d10
     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9,  d11
+    transpose_4x4   d5, d7, d9, d11
 #else
-    vmull.s16       q13, d6,  d0[3]
+    vmull.s16       q13, d6, d0[3]
     vmlal.s16       q13, d10, d0[2]
     vmlal.s16       q13, d12, d0[1]
     vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7,  d0[3]
+    vmull.s16       q12, d7, d0[3]
     vmlal.s16       q12, d11, d0[2]
     vmlal.s16       q12, d13, d0[1]
     vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4,  #15
-    vshll.s16       q15, d5,  #15
+    vshll.s16       q14, d4, #15
+    vshll.s16       q15, d5, #15
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
-    vrshrn.s32      d4,  q10, #13
-    vrshrn.s32      d6,  q14, #13
+    vrshrn.s32      d4, q10, #13
+    vrshrn.s32      d6, q14, #13
     vadd.s32        q10, q15, q12
     vsub.s32        q14, q15, q12
-    vrshrn.s32      d5,  q10, #13
-    vrshrn.s32      d7,  q14, #13
-    vtrn.16         q2,  q3
-    vtrn.32         q3,  q5
+    vrshrn.s32      d5, q10, #13
+    vrshrn.s32      d7, q14, #13
+    vtrn.16         q2, q3
+    vtrn.32         q3, q5
 #endif
 
     /* Pass 2 */
@@ -1279,110 +1282,110 @@
 
 
 .macro do_load size
-    .if \size == 8
-        vld1.8  {d4}, [U, :64]!
-        vld1.8  {d5}, [V, :64]!
-        vld1.8  {d0}, [Y, :64]!
-        pld     [U, #64]
-        pld     [V, #64]
-        pld     [Y, #64]
-    .elseif \size == 4
-        vld1.8  {d4[0]}, [U]!
-        vld1.8  {d4[1]}, [U]!
-        vld1.8  {d4[2]}, [U]!
-        vld1.8  {d4[3]}, [U]!
-        vld1.8  {d5[0]}, [V]!
-        vld1.8  {d5[1]}, [V]!
-        vld1.8  {d5[2]}, [V]!
-        vld1.8  {d5[3]}, [V]!
-        vld1.8  {d0[0]}, [Y]!
-        vld1.8  {d0[1]}, [Y]!
-        vld1.8  {d0[2]}, [Y]!
-        vld1.8  {d0[3]}, [Y]!
-    .elseif \size == 2
-        vld1.8  {d4[4]}, [U]!
-        vld1.8  {d4[5]}, [U]!
-        vld1.8  {d5[4]}, [V]!
-        vld1.8  {d5[5]}, [V]!
-        vld1.8  {d0[4]}, [Y]!
-        vld1.8  {d0[5]}, [Y]!
-    .elseif \size == 1
-        vld1.8  {d4[6]}, [U]!
-        vld1.8  {d5[6]}, [V]!
-        vld1.8  {d0[6]}, [Y]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vld1.8          {d4}, [U, :64]!
+    vld1.8          {d5}, [V, :64]!
+    vld1.8          {d0}, [Y, :64]!
+    pld             [U, #64]
+    pld             [V, #64]
+    pld             [Y, #64]
+  .elseif \size == 4
+    vld1.8          {d4[0]}, [U]!
+    vld1.8          {d4[1]}, [U]!
+    vld1.8          {d4[2]}, [U]!
+    vld1.8          {d4[3]}, [U]!
+    vld1.8          {d5[0]}, [V]!
+    vld1.8          {d5[1]}, [V]!
+    vld1.8          {d5[2]}, [V]!
+    vld1.8          {d5[3]}, [V]!
+    vld1.8          {d0[0]}, [Y]!
+    vld1.8          {d0[1]}, [Y]!
+    vld1.8          {d0[2]}, [Y]!
+    vld1.8          {d0[3]}, [Y]!
+  .elseif \size == 2
+    vld1.8          {d4[4]}, [U]!
+    vld1.8          {d4[5]}, [U]!
+    vld1.8          {d5[4]}, [V]!
+    vld1.8          {d5[5]}, [V]!
+    vld1.8          {d0[4]}, [Y]!
+    vld1.8          {d0[5]}, [Y]!
+  .elseif \size == 1
+    vld1.8          {d4[6]}, [U]!
+    vld1.8          {d5[6]}, [V]!
+    vld1.8          {d0[6]}, [Y]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vst3.8  {d10, d11, d12}, [RGB]!
-        .elseif \size == 4
-            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vst4.8  {d10, d11, d12, d13}, [RGB]!
-        .elseif \size == 4
-            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 16
-        .if \size == 8
-            vst1.16  {q15}, [RGB]!
-        .elseif \size == 4
-            vst1.16  {d30}, [RGB]!
-        .elseif \size == 2
-            vst1.16  {d31[0]}, [RGB]!
-            vst1.16  {d31[1]}, [RGB]!
-        .elseif \size == 1
-            vst1.16  {d31[2]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vst3.8        {d10, d11, d12}, [RGB]!
+    .elseif \size == 4
+      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vst4.8        {d10, d11, d12, d13}, [RGB]!
+    .elseif \size == 4
+      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      vst1.16       {q15}, [RGB]!
+    .elseif \size == 4
+      vst1.16       {d30}, [RGB]!
+    .elseif \size == 2
+      vst1.16       {d31[0]}, [RGB]!
+      vst1.16       {d31[1]}, [RGB]!
+    .elseif \size == 1
+      vst1.16       {d31[2]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined YCbCr->RGB conversion
+ * 2-stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
@@ -1395,17 +1398,17 @@
     vaddw.u8        q11, q10, d0
     vaddw.u8        q12, q12, d0
     vaddw.u8        q14, q14, d0
-.if \bpp != 16
+  .if \bpp != 16
     vqmovun.s16     d1\g_offs, q11
     vqmovun.s16     d1\r_offs, q12
     vqmovun.s16     d1\b_offs, q14
-.else /* rgb565 */
+  .else  /* rgb565 */
     vqshlu.s16      q13, q11, #8
     vqshlu.s16      q15, q12, #8
     vqshlu.s16      q14, q14, #8
     vsri.u16        q15, q13, #5
     vsri.u16        q15, q14, #11
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1421,27 +1424,27 @@
                                        vrshrn.s32      d28, q14, #14
     vld1.8          {d5}, [V, :64]!
                                        vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
                                        vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
                                        vaddw.u8        q12, q12, d0
                                        vaddw.u8        q14, q14, d0
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
                                        vqmovun.s16     d1\g_offs, q11
     pld             [Y, #64]
                                        vqmovun.s16     d1\r_offs, q12
     vld1.8          {d0}, [Y, :64]!
                                        vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
                                        do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
-.else /**************************** rgb565 ***********************************/
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
+  .else  /**************************** rgb565 ********************************/
                                        vqshlu.s16      q13, q11, #8
     pld             [Y, #64]
                                        vqshlu.s16      q15, q12, #8
@@ -1456,7 +1459,7 @@
     vmull.s16       q14, d6, d1[3]
                                        do_store        \bpp, 8
     vmull.s16       q15, d7, d1[3]
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb
@@ -1470,10 +1473,10 @@
 
 .balign 16
 jsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
 asm_function jsimd_ycc_\colorid\()_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -1618,123 +1621,123 @@
  */
 
 .macro do_store size
-    .if \size == 8
-        vst1.8  {d20}, [Y]!
-        vst1.8  {d21}, [U]!
-        vst1.8  {d22}, [V]!
-    .elseif \size == 4
-        vst1.8  {d20[0]}, [Y]!
-        vst1.8  {d20[1]}, [Y]!
-        vst1.8  {d20[2]}, [Y]!
-        vst1.8  {d20[3]}, [Y]!
-        vst1.8  {d21[0]}, [U]!
-        vst1.8  {d21[1]}, [U]!
-        vst1.8  {d21[2]}, [U]!
-        vst1.8  {d21[3]}, [U]!
-        vst1.8  {d22[0]}, [V]!
-        vst1.8  {d22[1]}, [V]!
-        vst1.8  {d22[2]}, [V]!
-        vst1.8  {d22[3]}, [V]!
-    .elseif \size == 2
-        vst1.8  {d20[4]}, [Y]!
-        vst1.8  {d20[5]}, [Y]!
-        vst1.8  {d21[4]}, [U]!
-        vst1.8  {d21[5]}, [U]!
-        vst1.8  {d22[4]}, [V]!
-        vst1.8  {d22[5]}, [V]!
-    .elseif \size == 1
-        vst1.8  {d20[6]}, [Y]!
-        vst1.8  {d21[6]}, [U]!
-        vst1.8  {d22[6]}, [V]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_load bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vld3.8  {d10, d11, d12}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vld4.8  {d10, d11, d12, d13}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined RGB->YCbCr conversion
+ * 2-stage pipelined RGB->YCbCr conversion
  */
 
 .macro do_rgb_to_yuv_stage1
-    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
-    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
-    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vrev64.32   q9,  q1
-    vrev64.32   q13, q1
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .macro do_rgb_to_yuv_stage2
-    vrshrn.u32  d20, q7,  #16
-    vrshrn.u32  d21, q8,  #16
-    vshrn.u32   d22, q9,  #16
-    vshrn.u32   d23, q13, #16
-    vshrn.u32   d24, q14, #16
-    vshrn.u32   d25, q15, #16
-    vmovn.u16   d20, q10      /* d20 = y */
-    vmovn.u16   d21, q11      /* d21 = u */
-    vmovn.u16   d22, q12      /* d22 = v */
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
 .endm
 
 .macro do_rgb_to_yuv
@@ -1743,52 +1746,52 @@
 .endm
 
 .macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32  d20, q7,  #16
-      vrshrn.u32  d21, q8,  #16
-      vshrn.u32   d22, q9,  #16
-    vrev64.32   q9,  q1
-      vshrn.u32   d23, q13, #16
-    vrev64.32   q13, q1
-      vshrn.u32   d24, q14, #16
-      vshrn.u32   d25, q15, #16
-    do_load     \bpp, 8
-      vmovn.u16   d20, q10      /* d20 = y */
-    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
-      vmovn.u16   d21, q11      /* d21 = u */
-    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
-      vmovn.u16   d22, q12      /* d22 = v */
-    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-      vst1.8      {d20}, [Y]!
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-      vst1.8      {d21}, [U]!
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-      vst1.8      {d22}, [V]!
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .balign 16
 jsimd_\colorid\()_ycc_neon_consts:
-    .short          19595, 38470, 7471,  11059
-    .short          21709, 32768, 27439, 5329
-    .short          32767, 128,   32767, 128
-    .short          32767, 128,   32767, 128
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
 
 asm_function jsimd_\colorid\()_ycc_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -1998,10 +2001,10 @@
 
 .balign 16
 jsimd_fdct_ifast_neon_consts:
-    .short (98 * 128)              /* XFIX_0_382683433 */
-    .short (139 * 128)             /* XFIX_0_541196100 */
-    .short (181 * 128)             /* XFIX_0_707106781 */
-    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
 
 asm_function jsimd_fdct_ifast_neon
 
@@ -2038,52 +2041,52 @@
     /* Transpose */
     vtrn.16         q12, q13
     vtrn.16         q10, q11
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q14, q15
-    vtrn.32         q9,  q11
+    vtrn.32         q9, q11
     vtrn.32         q13, q15
-    vtrn.32         q8,  q10
+    vtrn.32         q8, q10
     vtrn.32         q12, q14
     vswp            d30, d23
     vswp            d24, d17
     vswp            d26, d19
       /* 1-D FDCT */
-      vadd.s16        q2,  q11, q12
+      vadd.s16        q2, q11, q12
     vswp            d28, d21
       vsub.s16        q12, q11, q12
-      vsub.s16        q6,  q10, q13
+      vsub.s16        q6, q10, q13
       vadd.s16        q10, q10, q13
-      vsub.s16        q7,  q9,  q14
-      vadd.s16        q9,  q9,  q14
-      vsub.s16        q1,  q8,  q15
-      vadd.s16        q8,  q8,  q15
-      vsub.s16        q4,  q9,  q10
-      vsub.s16        q5,  q8,  q2
-      vadd.s16        q3,  q9,  q10
-      vadd.s16        q4,  q4,  q5
-      vadd.s16        q2,  q8,  q2
-      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
+      vsub.s16        q7, q9, q14
+      vadd.s16        q9, q9, q14
+      vsub.s16        q1, q8, q15
+      vadd.s16        q8, q8, q15
+      vsub.s16        q4, q9, q10
+      vsub.s16        q5, q8, q2
+      vadd.s16        q3, q9, q10
+      vadd.s16        q4, q4, q5
+      vadd.s16        q2, q8, q2
+      vqdmulh.s16     q4, q4, XFIX_0_707106781
       vadd.s16        q11, q12, q6
-      vadd.s16        q8,  q2,  q3
-      vsub.s16        q12, q2,  q3
-      vadd.s16        q3,  q6,  q7
-      vadd.s16        q7,  q7,  q1
-      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
-      vsub.s16        q6,  q11, q7
-      vadd.s16        q10, q5,  q4
-      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
-      vsub.s16        q14, q5,  q4
+      vadd.s16        q8, q2, q3
+      vsub.s16        q12, q2, q3
+      vadd.s16        q3, q6, q7
+      vadd.s16        q7, q7, q1
+      vqdmulh.s16     q3, q3, XFIX_0_707106781
+      vsub.s16        q6, q11, q7
+      vadd.s16        q10, q5, q4
+      vqdmulh.s16     q6, q6, XFIX_0_382683433
+      vsub.s16        q14, q5, q4
       vqdmulh.s16     q11, q11, XFIX_0_541196100
-      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
-      vadd.s16        q4,  q1,  q3
-      vsub.s16        q3,  q1,  q3
-      vadd.s16        q7,  q7,  q6
+      vqdmulh.s16     q5, q7, XFIX_1_306562965
+      vadd.s16        q4, q1, q3
+      vsub.s16        q3, q1, q3
+      vadd.s16        q7, q7, q6
       vadd.s16        q11, q11, q6
-      vadd.s16        q7,  q7,  q5
-      vadd.s16        q13, q3,  q11
-      vsub.s16        q11, q3,  q11
-      vadd.s16        q9,  q4,  q7
-      vsub.s16        q15, q4,  q7
+      vadd.s16        q7, q7, q5
+      vadd.s16        q13, q3, q11
+      vsub.s16        q11, q3, q11
+      vadd.s16        q9, q4, q7
+      vsub.s16        q15, q4, q7
     subs            TMP, TMP, #1
     bne             1b
 
@@ -2104,8 +2107,8 @@
 
 /*
  * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
- *                      DCTELEM * workspace);
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
+ *                      DCTELEM *workspace);
  *
  * Note: the code uses 2 stage pipelining in order to improve instructions
  *       scheduling and eliminate stalls (this provides ~15% better
@@ -2132,22 +2135,22 @@
     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
     vabs.s16        q13, q1
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
 
     push            {r4, r5}
@@ -2160,25 +2163,25 @@
     vabs.s16        q13, q1
       veor.u16        q15, q15, q3
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
       vsub.u16        q14, q14, q2
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
       vsub.u16        q15, q15, q3
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
     subs            LOOP_COUNT, LOOP_COUNT, #1
     bne             1b
@@ -2190,7 +2193,7 @@
       vsub.u16        q15, q15, q3
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
 
-    bx              lr /* return */
+    bx              lr  /* return */
 
     .unreq          COEF_BLOCK
     .unreq          DIVISORS
@@ -2205,10 +2208,10 @@
 
 /*
  * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
- *                                 JDIMENSION   downsampled_width,
- *                                 JSAMPARRAY   input_data,
- *                                 JSAMPARRAY * output_data_ptr);
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
+ *                                 JSAMPARRAY *output_data_ptr);
  *
  * Note: the use of unaligned writes is the main remaining bottleneck in
  *       this code, which can be potentially solved to get up to tens
@@ -2222,22 +2225,22 @@
  * Register d28 is used for multiplication by 3. Register q15 is used
  * for adding +1 bias.
  */
-.macro upsample16   OUTPTR, INPTR
+.macro upsample16 OUTPTR, INPTR
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vmov            q1,  q0       /* backup source pixels to q1 */
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0        /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
@@ -2248,39 +2251,39 @@
  * Also this unrolling allows to reorder loads and stores to compensate
  * multiplication latency and reduce stalls.
  */
-.macro upsample32   OUTPTR, INPTR
+.macro upsample32 OUTPTR, INPTR
     /* even 16 pixels group */
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-        /* odd 16 pixels group */
-        vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
-        vmovl.u8        q8,  d2
-        vext.8          q2,  q0,  q1, #15
-        vmovl.u8        q9,  d3
-        vaddw.u8        q10, q15, d4
-        vaddw.u8        q11, q15, d5
-        vmlal.u8        q8,  d4,  d28
-        vmlal.u8        q9,  d5,  d28
-        vmlal.u8        q10, d2,  d28
-        vmlal.u8        q11, d3,  d28
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+      /* odd 16 pixels group */
+      vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+      vmovl.u8        q8, d2
+      vext.8          q2, q0, q1, #15
+      vmovl.u8        q9, d3
+      vaddw.u8        q10, q15, d4
+      vaddw.u8        q11, q15, d5
+      vmlal.u8        q8, d4, d28
+      vmlal.u8        q9, d5, d28
+      vmlal.u8        q10, d2, d28
+      vmlal.u8        q11, d3, d28
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-        vrshrn.u16      d6,  q8,  #2
-        vrshrn.u16      d7,  q9,  #2
-        vshrn.u16       d8,  q10, #2
-        vshrn.u16       d9,  q11, #2
-        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+      vrshrn.u16      d6, q8, #2
+      vrshrn.u16      d7, q9, #2
+      vshrn.u16       d8, q10, #2
+      vshrn.u16       d9, q11, #2
+      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
 /*
@@ -2341,21 +2344,21 @@
 2:
     tst             \WIDTH, #8
     beq             2f
-    vmov            d1,  d0
+    vmov            d1, d0
     sub             \INPTR, \INPTR, #8
     vld1.8          {d0}, [\INPTR]
 2:  /* upsample the remaining pixels */
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vrshrn.u16      d10, q8,  #2
-    vrshrn.u16      d12, q9,  #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
     vshrn.u16       d11, q10, #2
     vshrn.u16       d13, q11, #2
     vzip.8          d10, d11
@@ -2364,12 +2367,12 @@
     tst             \WIDTH, #8
     beq             2f
     vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5,  q6
+    vmov            q5, q6
 2:
     tst             \WIDTH, #4
     beq             2f
     vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10,  d11
+    vmov            d10, d11
 2:
     tst             \WIDTH, #2
     beq             2f
@@ -2433,7 +2436,443 @@
     .unreq          WIDTH
     .unreq          TMP
 
-
 .purgem upsample16
 .purgem upsample32
 .purgem upsample_row
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block_neon (working_state *state, JOCTET *buffer,
+ *                                   JCOEFPTR block, int last_dc_val,
+ *                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+    sub             \PUT_BITS, \PUT_BITS, #0x8  /* PUT_BITS -= 8 */
+    lsr             \TMP, \PUT_BUFFER, \PUT_BITS  /* next byte -> bits 7:0 */
+    uxtb            \TMP, \TMP  /* TMP &= 0xff */
+    strb            \TMP, [\BUFFER, #1]!  /* *++BUFFER = TMP */
+    cmp             \TMP, #0xff
+    /*it eq*/
+    strbeq          \ZERO, [\BUFFER, #1]!  /* if TMP == 0xff: *++BUFFER = ZERO */
+.endm
+
+.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+    /* Append CODE: PUT_BUFFER = (PUT_BUFFER << SIZE) | CODE */
+    add             \PUT_BITS, \SIZE  /* PUT_BITS += SIZE */
+    /* one orr with a shifted operand replaces a separate lsl + orr pair */
+    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+.endm
+
+.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+  cmp               \PUT_BITS, #0x10  /* fewer than 16 bits buffered? */
+  blt               15f  /* then skip (signed compare) */
+    eor               \ZERO, \ZERO, \ZERO  /* ZERO = 0; emit two bytes */
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+15:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+  .byte 0x01
+  .byte 0x02
+  .byte 0x04
+  .byte 0x08
+  .byte 0x10
+  .byte 0x20
+  .byte 0x40
+  .byte 0x80
+
+asm_function jsimd_huff_encode_one_block_neon
+    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    add             r7, sp, #0x1c
+    sub             r4, sp, #0x40
+    bfc             r4, #0, #5
+    mov             sp, r4           /* align sp on 32 bytes */
+    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vst1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             sp, #0x140       /* reserve 320 bytes */
+    str             r0, [sp, #0x18]  /* working state > sp + 0x18 */
+    add             r4, sp, #0x20    /* r4 = t1 */
+    ldr             lr, [r7, #0x8]   /* lr = dctbl */
+    sub             r10, r1, #0x1    /* r10 = buffer - 1 */
+    ldrsh           r1, [r2]
+    mov             r9, #0x10
+    mov             r8, #0x1
+    adr             r5, jsimd_huff_encode_one_block_neon_consts
+    /* prepare data */
+    vld1.8          {d26}, [r5, :64]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    vdup.16         q14, r9
+    vdup.16         q15, r8
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    sub             r1, r1, r3
+    add             r9, r2, #0x22
+    add             r8, r2, #0x18
+    add             r3, r2, #0x36
+    vmov.16         d0[0], r1
+    vld1.16         {d2[0]}, [r9, :16]
+    vld1.16         {d4[0]}, [r8, :16]
+    vld1.16         {d6[0]}, [r3, :16]
+    add             r1, r2, #0x2
+    add             r9, r2, #0x30
+    add             r8, r2, #0x26
+    add             r3, r2, #0x28
+    vld1.16         {d0[1]}, [r1, :16]
+    vld1.16         {d2[1]}, [r9, :16]
+    vld1.16         {d4[1]}, [r8, :16]
+    vld1.16         {d6[1]}, [r3, :16]
+    add             r1, r2, #0x10
+    add             r9, r2, #0x40
+    add             r8, r2, #0x34
+    add             r3, r2, #0x1a
+    vld1.16         {d0[2]}, [r1, :16]
+    vld1.16         {d2[2]}, [r9, :16]
+    vld1.16         {d4[2]}, [r8, :16]
+    vld1.16         {d6[2]}, [r3, :16]
+    add             r1, r2, #0x20
+    add             r9, r2, #0x32
+    add             r8, r2, #0x42
+    add             r3, r2, #0xc
+    vld1.16         {d0[3]}, [r1, :16]
+    vld1.16         {d2[3]}, [r9, :16]
+    vld1.16         {d4[3]}, [r8, :16]
+    vld1.16         {d6[3]}, [r3, :16]
+    add             r1, r2, #0x12
+    add             r9, r2, #0x24
+    add             r8, r2, #0x50
+    add             r3, r2, #0xe
+    vld1.16         {d1[0]}, [r1, :16]
+    vld1.16         {d3[0]}, [r9, :16]
+    vld1.16         {d5[0]}, [r8, :16]
+    vld1.16         {d7[0]}, [r3, :16]
+    add             r1, r2, #0x4
+    add             r9, r2, #0x16
+    add             r8, r2, #0x60
+    add             r3, r2, #0x1c
+    vld1.16         {d1[1]}, [r1, :16]
+    vld1.16         {d3[1]}, [r9, :16]
+    vld1.16         {d5[1]}, [r8, :16]
+    vld1.16         {d7[1]}, [r3, :16]
+    add             r1, r2, #0x6
+    add             r9, r2, #0x8
+    add             r8, r2, #0x52
+    add             r3, r2, #0x2a
+    vld1.16         {d1[2]}, [r1, :16]
+    vld1.16         {d3[2]}, [r9, :16]
+    vld1.16         {d5[2]}, [r8, :16]
+    vld1.16         {d7[2]}, [r3, :16]
+    add             r1, r2, #0x14
+    add             r9, r2, #0xa
+    add             r8, r2, #0x44
+    add             r3, r2, #0x38
+    vld1.16         {d1[3]}, [r1, :16]
+    vld1.16         {d3[3]}, [r9, :16]
+    vld1.16         {d5[3]}, [r8, :16]
+    vld1.16         {d7[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q0
+    vcgt.s16        q9, q9, q1
+    vcgt.s16        q10, q10, q2
+    vcgt.s16        q11, q11, q3
+    vabs.s16        q0, q0
+    vabs.s16        q1, q1
+    vabs.s16        q2, q2
+    vabs.s16        q3, q3
+    veor            q8, q8, q0
+    veor            q9, q9, q1
+    veor            q10, q10, q2
+    veor            q11, q11, q3
+    add             r9, r4, #0x20
+    add             r8, r4, #0x80
+    add             r3, r4, #0xa0
+    vclz.i16        q0, q0
+    vclz.i16        q1, q1
+    vclz.i16        q2, q2
+    vclz.i16        q3, q3
+    vsub.i16        q0, q14, q0
+    vsub.i16        q1, q14, q1
+    vsub.i16        q2, q14, q2
+    vsub.i16        q3, q14, q3
+    vst1.16         {d0, d1, d2, d3}, [r4, :256]
+    vst1.16         {d4, d5, d6, d7}, [r9, :256]
+    vshl.s16        q0, q15, q0
+    vshl.s16        q1, q15, q1
+    vshl.s16        q2, q15, q2
+    vshl.s16        q3, q15, q3
+    vsub.i16        q0, q0, q15
+    vsub.i16        q1, q1, q15
+    vsub.i16        q2, q2, q15
+    vsub.i16        q3, q3, q15
+    vand            q8, q8, q0
+    vand            q9, q9, q1
+    vand            q10, q10, q2
+    vand            q11, q11, q3
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    add             r1, r2, #0x46
+    add             r9, r2, #0x3a
+    add             r8, r2, #0x74
+    add             r3, r2, #0x6a
+    vld1.16         {d8[0]}, [r1, :16]
+    vld1.16         {d10[0]}, [r9, :16]
+    vld1.16         {d12[0]}, [r8, :16]
+    vld1.16         {d14[0]}, [r3, :16]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    add             r1, r2, #0x54
+    add             r9, r2, #0x2c
+    add             r8, r2, #0x76
+    add             r3, r2, #0x78
+    vld1.16         {d8[1]}, [r1, :16]
+    vld1.16         {d10[1]}, [r9, :16]
+    vld1.16         {d12[1]}, [r8, :16]
+    vld1.16         {d14[1]}, [r3, :16]
+    add             r1, r2, #0x62
+    add             r9, r2, #0x1e
+    add             r8, r2, #0x68
+    add             r3, r2, #0x7a
+    vld1.16         {d8[2]}, [r1, :16]
+    vld1.16         {d10[2]}, [r9, :16]
+    vld1.16         {d12[2]}, [r8, :16]
+    vld1.16         {d14[2]}, [r3, :16]
+    add             r1, r2, #0x70
+    add             r9, r2, #0x2e
+    add             r8, r2, #0x5a
+    add             r3, r2, #0x6c
+    vld1.16         {d8[3]}, [r1, :16]
+    vld1.16         {d10[3]}, [r9, :16]
+    vld1.16         {d12[3]}, [r8, :16]
+    vld1.16         {d14[3]}, [r3, :16]
+    add             r1, r2, #0x72
+    add             r9, r2, #0x3c
+    add             r8, r2, #0x4c
+    add             r3, r2, #0x5e
+    vld1.16         {d9[0]}, [r1, :16]
+    vld1.16         {d11[0]}, [r9, :16]
+    vld1.16         {d13[0]}, [r8, :16]
+    vld1.16         {d15[0]}, [r3, :16]
+    add             r1, r2, #0x64
+    add             r9, r2, #0x4a
+    add             r8, r2, #0x3e
+    add             r3, r2, #0x6e
+    vld1.16         {d9[1]}, [r1, :16]
+    vld1.16         {d11[1]}, [r9, :16]
+    vld1.16         {d13[1]}, [r8, :16]
+    vld1.16         {d15[1]}, [r3, :16]
+    add             r1, r2, #0x56
+    add             r9, r2, #0x58
+    add             r8, r2, #0x4e
+    add             r3, r2, #0x7c
+    vld1.16         {d9[2]}, [r1, :16]
+    vld1.16         {d11[2]}, [r9, :16]
+    vld1.16         {d13[2]}, [r8, :16]
+    vld1.16         {d15[2]}, [r3, :16]
+    add             r1, r2, #0x48
+    add             r9, r2, #0x66
+    add             r8, r2, #0x5c
+    add             r3, r2, #0x7e
+    vld1.16         {d9[3]}, [r1, :16]
+    vld1.16         {d11[3]}, [r9, :16]
+    vld1.16         {d13[3]}, [r8, :16]
+    vld1.16         {d15[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q4
+    vcgt.s16        q9, q9, q5
+    vcgt.s16        q10, q10, q6
+    vcgt.s16        q11, q11, q7
+    vabs.s16        q4, q4
+    vabs.s16        q5, q5
+    vabs.s16        q6, q6
+    vabs.s16        q7, q7
+    veor            q8, q8, q4
+    veor            q9, q9, q5
+    veor            q10, q10, q6
+    veor            q11, q11, q7
+    add             r1, r4, #0x40
+    add             r9, r4, #0x60
+    add             r8, r4, #0xc0
+    add             r3, r4, #0xe0
+    vclz.i16        q4, q4
+    vclz.i16        q5, q5
+    vclz.i16        q6, q6
+    vclz.i16        q7, q7
+    vsub.i16        q4, q14, q4
+    vsub.i16        q5, q14, q5
+    vsub.i16        q6, q14, q6
+    vsub.i16        q7, q14, q7
+    vst1.16         {d8, d9, d10, d11}, [r1, :256]
+    vst1.16         {d12, d13, d14, d15}, [r9, :256]
+    vshl.s16        q4, q15, q4
+    vshl.s16        q5, q15, q5
+    vshl.s16        q6, q15, q6
+    vshl.s16        q7, q15, q7
+    vsub.i16        q4, q4, q15
+    vsub.i16        q5, q5, q15
+    vsub.i16        q6, q6, q15
+    vsub.i16        q7, q7, q15
+    vand            q8, q8, q4
+    vand            q9, q9, q5
+    vand            q10, q10, q6
+    vand            q11, q11, q7
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    ldr             r12, [r7, #0xc]       /* r12 = actbl */
+    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
+    mov             r9, r12               /* r9 = actbl */
+    add             r6, r4, #0x80         /* r6 = t2 */
+    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
+    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
+    ldrh            r2, [r6, #-128]       /* r2  = nbits */
+    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldr             r0, [lr, r2, lsl #2]
+    ldrb            r5, [r1, r2]
+    put_bits        r11, r4, r0, r5
+    checkbuf15      r10, r11, r4, r5, r0
+    put_bits        r11, r4, r3, r2
+    checkbuf15      r10, r11, r4, r5, r0
+    mov             lr, r6                /* lr = t2 */
+    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
+    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
+    veor            q8, q8, q8
+    vceq.i16        q0, q0, q8
+    vceq.i16        q1, q1, q8
+    vceq.i16        q2, q2, q8
+    vceq.i16        q3, q3, q8
+    vceq.i16        q4, q4, q8
+    vceq.i16        q5, q5, q8
+    vceq.i16        q6, q6, q8
+    vceq.i16        q7, q7, q8
+    vmovn.i16       d0, q0
+    vmovn.i16       d2, q1
+    vmovn.i16       d4, q2
+    vmovn.i16       d6, q3
+    vmovn.i16       d8, q4
+    vmovn.i16       d10, q5
+    vmovn.i16       d12, q6
+    vmovn.i16       d14, q7
+    vand            d0, d0, d26
+    vand            d2, d2, d26
+    vand            d4, d4, d26
+    vand            d6, d6, d26
+    vand            d8, d8, d26
+    vand            d10, d10, d26
+    vand            d12, d12, d26
+    vand            d14, d14, d26
+    vpadd.i8        d0, d0, d2
+    vpadd.i8        d4, d4, d6
+    vpadd.i8        d8, d8, d10
+    vpadd.i8        d12, d12, d14
+    vpadd.i8        d0, d0, d4
+    vpadd.i8        d8, d8, d12
+    vpadd.i8        d0, d0, d8
+    vmov.32         r1, d0[1]
+    vmov.32         r8, d0[0]
+    mvn             r1, r1
+    mvn             r8, r8
+    lsrs            r1, r1, #0x1
+    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
+    rbit            r1, r1            /* r1 = index1 */
+    rbit            r8, r8            /* r8 = index0 */
+    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
+    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
+    cmp             r8, #0x0
+    beq             6f
+1:
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
+    ldrh            r1, [lr, #-126]
+2:
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
+3:
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
+6:
+    add             r12, sp, #0x20   /* r12 = t1 */
+    ldr             r8, [sp, #0x14]  /* r8 = index1 */
+    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
+    cmp             r8, #0x0
+    beq             6f
+    clz             r2, r8
+    sub             r12, r12, lr
+    lsl             r8, r8, r2
+    add             r2, r2, r12, lsr #1
+    add             lr, lr, r2, lsl #1
+    b               7f
+1:
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
+7:
+    ldrh            r1, [lr, #-126]
+2:
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
+3:
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
+6:
+    add             r0, sp, #0x20
+    add             r0, #0xfe
+    cmp             lr, r0
+    bhs             1f
+    ldr             r1, [r9]
+    ldrb            r0, [r5]
+    put_bits        r11, r4, r1, r0
+    checkbuf15      r10, r11, r4, r0, r1
+1:
+    ldr             r12, [sp, #0x18]
+    str             r11, [r12, #0x8]
+    str             r4, [r12, #0xc]
+    add             r0, r10, #0x1
+    add             r4, sp, #0x140
+    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vld1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             r4, r7, #0x1c
+    mov             sp, r4
+    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf15
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index 25d0697..aef1ad4 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,7 +2,8 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -30,6 +31,7 @@
 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
@@ -62,6 +64,9 @@
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
 }
 
 GLOBAL(int)
@@ -332,7 +337,7 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_SSE2)
@@ -347,7 +352,7 @@
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_SSE2)
@@ -401,9 +406,9 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
@@ -415,9 +420,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
@@ -469,9 +474,9 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
@@ -486,9 +491,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
@@ -693,7 +698,7 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_sse2(sample_data, start_col, workspace);
@@ -703,7 +708,7 @@
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
@@ -771,7 +776,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_islow_sse2(data);
@@ -780,7 +785,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_ifast_sse2(data);
@@ -789,7 +794,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     jsimd_fdct_float_sse(data);
@@ -842,8 +847,8 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
@@ -852,8 +857,8 @@
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_float_sse2(coef_block, divisors, workspace);
@@ -914,7 +919,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -926,7 +931,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -1018,7 +1023,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -1031,7 +1036,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -1044,7 +1049,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -1059,3 +1064,28 @@
                            output_col);
 }
 
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
index cf87b32..bdd9912 100644
--- a/simd/jsimd_mips.c
+++ b/simd/jsimd_mips.c
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2014 D. R. Commander
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
+ * Copyright 2015 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -87,9 +88,9 @@
 
 /* The following struct is borrowed from jdsample.c */
 typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JSAMPARRAY input_data,
-                               JSAMPARRAY * output_data_ptr);
+                               JSAMPARRAY *output_data_ptr);
 
 typedef struct {
   struct jpeg_upsampler pub;
@@ -102,7 +103,7 @@
   UINT8 v_expand[MAX_COMPONENTS];
 } my_upsampler;
 
-typedef my_upsampler * my_upsample_ptr;
+typedef my_upsampler *my_upsample_ptr;
 
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
@@ -377,7 +378,7 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
@@ -390,7 +391,7 @@
 
 GLOBAL(void)
 jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
-                              jpeg_component_info * compptr,
+                              jpeg_component_info *compptr,
                               JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
@@ -402,7 +403,7 @@
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
@@ -466,9 +467,9 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
@@ -478,9 +479,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
@@ -489,8 +490,8 @@
 }
 
 GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
 
@@ -537,9 +538,9 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
@@ -549,9 +550,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
@@ -719,7 +720,7 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
@@ -727,7 +728,7 @@
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
   if ((simd_support & JSIMD_MIPS_DSPR2))
     jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
@@ -776,21 +777,21 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_fdct_islow_mips_dspr2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_fdct_ifast_mips_dspr2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -837,16 +838,16 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
   if (simd_support & JSIMD_MIPS_DSPR2)
     jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
@@ -944,7 +945,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -954,7 +955,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -966,7 +967,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
            JCOEFPTR coef_block, JSAMPARRAY output_buf,
            JDIMENSION output_col)
 {
@@ -976,7 +977,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block,
                   JSAMPARRAY output_buf, JDIMENSION output_col)
 {
@@ -1059,7 +1060,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -1081,13 +1082,13 @@
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
   if (simd_support & JSIMD_MIPS_DSPR2) {
     JCOEFPTR inptr;
-    IFAST_MULT_TYPE * quantptr;
+    IFAST_MULT_TYPE *quantptr;
     DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
 
     /* Pass 1: process columns from input, store into work array. */
@@ -1108,8 +1109,22 @@
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;  /* always 0: no SIMD Huffman encoder is reported here */
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;  /* stub: ignores every argument and returns NULL */
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
index 65c169a..0eed1ce 100644
--- a/simd/jsimd_mips_dspr2.S
+++ b/simd/jsimd_mips_dspr2.S
@@ -5,6 +5,7 @@
  * All rights reserved.
  * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
  *           Darko Laus       (darko.laus@imgtec.com)
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
@@ -1992,14 +1993,14 @@
     move      v0, sp
     addiu     v1, zero, 8
 4:
-    lw        t0, 8(v0)        // z2 = (INT32) wsptr[2]
-    lw        t1, 24(v0)       // z3 = (INT32) wsptr[6]
-    lw        t2, 0(v0)        // (INT32) wsptr[0]
-    lw        t3, 16(v0)       // (INT32) wsptr[4]
-    lw        s4, 4(v0)        // (INT32) wsptr[1]
-    lw        s5, 12(v0)       // (INT32) wsptr[3]
-    lw        s6, 20(v0)       // (INT32) wsptr[5]
-    lw        s7, 28(v0)       // (INT32) wsptr[7]
+    lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
+    lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
+    lw        t2, 0(v0)        // (JLONG) wsptr[0]
+    lw        t3, 16(v0)       // (JLONG) wsptr[4]
+    lw        s4, 4(v0)        // (JLONG) wsptr[1]
+    lw        s5, 12(v0)       // (JLONG) wsptr[3]
+    lw        s6, 20(v0)       // (JLONG) wsptr[5]
+    lw        s7, 28(v0)       // (JLONG) wsptr[7]
     or        s4, s4, t0
     or        s4, s4, t1
     or        s4, s4, t3
@@ -2025,8 +2026,8 @@
     mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
     addiu     t8, zero, 6270   // FIX_0_765366865
     mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
-    addu      t4, t2, t3       // (INT32) wsptr[0] + (INT32) wsptr[4]
-    subu      t2, t2, t3       // (INT32) wsptr[0] - (INT32) wsptr[4]
+    addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
+    subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
     sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
     sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
     subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
@@ -2035,10 +2036,10 @@
     addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
     subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
     addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
-    lw        t4, 28(v0)       // tmp0 = (INT32) wsptr[7]
-    lw        t6, 12(v0)       // tmp2 = (INT32) wsptr[3]
-    lw        t5, 20(v0)       // tmp1 = (INT32) wsptr[5]
-    lw        t7, 4(v0)        // tmp3 = (INT32) wsptr[1]
+    lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
+    lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
+    lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
+    lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
     addu      s0, t4, t6       // z3 = tmp0 + tmp2
     addiu     t8, zero, 9633   // FIX_1_175875602
     addu      s1, t5, t7       // z4 = tmp1 + tmp3
@@ -2841,54 +2842,54 @@
     mtc1       t1, f0
     li         t0, 63
 0:
-    lwc1       f1, 0(a2)
-    lwc1       f5, 0(a1)
-    lwc1       f2, 4(a2)
-    lwc1       f6, 4(a1)
-    lwc1       f3, 8(a2)
-    lwc1       f7, 8(a1)
-    lwc1       f4, 12(a2)
-    lwc1       f8, 12(a1)
-    madd.s     f1, f0, f1, f5
-    madd.s     f2, f0, f2, f6
-    madd.s     f3, f0, f3, f7
-    madd.s     f4, f0, f4, f8
-    lwc1       f5, 16(a1)
-    lwc1       f6, 20(a1)
-    trunc.w.s  f1, f1
+    lwc1       f2, 0(a2)
+    lwc1       f10, 0(a1)
+    lwc1       f4, 4(a2)
+    lwc1       f12, 4(a1)
+    lwc1       f6, 8(a2)
+    lwc1       f14, 8(a1)
+    lwc1       f8, 12(a2)
+    lwc1       f16, 12(a1)
+    madd.s     f2, f0, f2, f10
+    madd.s     f4, f0, f4, f12
+    madd.s     f6, f0, f6, f14
+    madd.s     f8, f0, f8, f16
+    lwc1       f10, 16(a1)
+    lwc1       f12, 20(a1)
     trunc.w.s  f2, f2
-    trunc.w.s  f3, f3
     trunc.w.s  f4, f4
-    lwc1       f7, 24(a1)
-    lwc1       f8, 28(a1)
-    mfc1       t1, f1
-    mfc1       t2, f2
-    mfc1       t3, f3
-    mfc1       t4, f4
-    lwc1       f1, 16(a2)
-    lwc1       f2, 20(a2)
-    lwc1       f3, 24(a2)
-    lwc1       f4, 28(a2)
-    madd.s     f1, f0, f1, f5
-    madd.s     f2, f0, f2, f6
-    madd.s     f3, f0, f3, f7
-    madd.s     f4, f0, f4, f8
+    trunc.w.s  f6, f6
+    trunc.w.s  f8, f8
+    lwc1       f14, 24(a1)
+    lwc1       f16, 28(a1)
+    mfc1       t1, f2
+    mfc1       t2, f4
+    mfc1       t3, f6
+    mfc1       t4, f8
+    lwc1       f2, 16(a2)
+    lwc1       f4, 20(a2)
+    lwc1       f6, 24(a2)
+    lwc1       f8, 28(a2)
+    madd.s     f2, f0, f2, f10
+    madd.s     f4, f0, f4, f12
+    madd.s     f6, f0, f6, f14
+    madd.s     f8, f0, f8, f16
     addiu      t1, t1, -16384
     addiu      t2, t2, -16384
     addiu      t3, t3, -16384
     addiu      t4, t4, -16384
-    trunc.w.s  f1, f1
     trunc.w.s  f2, f2
-    trunc.w.s  f3, f3
     trunc.w.s  f4, f4
+    trunc.w.s  f6, f6
+    trunc.w.s  f8, f8
     sh         t1, 0(a0)
     sh         t2, 2(a0)
     sh         t3, 4(a0)
     sh         t4, 6(a0)
-    mfc1       t1, f1
-    mfc1       t2, f2
-    mfc1       t3, f3
-    mfc1       t4, f4
+    mfc1       t1, f2
+    mfc1       t2, f4
+    mfc1       t3, f6
+    mfc1       t4, f8
     addiu      t0, t0, -8
     addiu      a2, a2, 32
     addiu      a1, a1, 32
@@ -3269,9 +3270,9 @@
     lw        s6, 8(t1)         // wsptr[2]
     li        s5, 6270
     lw        s7, 24(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
     lw        t2, 0(t1)         // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
     lh        t5, 28(t1)        // wsptr[7]
     lh        t6, 20(t1)        // wsptr[5]
     lh        t7, 12(t1)        // wsptr[3]
@@ -3284,7 +3285,7 @@
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0
     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
@@ -3323,9 +3324,9 @@
     lw        s6, 40(t1)        // wsptr[2]
     li        s5, 6270
     lw        s7, 56(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
     lw        t2, 32(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
     lh        t5, 60(t1)        // wsptr[7]
     lh        t6, 52(t1)        // wsptr[5]
     lh        t7, 44(t1)        // wsptr[3]
@@ -3338,7 +3339,7 @@
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0
     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
@@ -3377,9 +3378,9 @@
     lw        s6, 72(t1)        // wsptr[2]
     li        s5, 6270
     lw        s7, 88(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
     lw        t2, 64(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
     lh        t5, 92(t1)        // wsptr[7]
     lh        t6, 84(t1)        // wsptr[5]
     lh        t7, 76(t1)        // wsptr[3]
@@ -3392,7 +3393,7 @@
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0
     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
@@ -3430,9 +3431,9 @@
     lw        s6, 104(t1)       // wsptr[2]
     li        s5, 6270
     lw        s7, 120(t1)       // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
     lw        t2, 96(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
     lh        t5, 124(t1)       // wsptr[7]
     lh        t6, 116(t1)       // wsptr[5]
     lh        t7, 108(t1)       // wsptr[3]
@@ -3445,7 +3446,7 @@
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0
     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
@@ -4151,32 +4152,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 4(a0)
-    swc1     f1, 0(a2)
-    swc1     f2, 4(a2)
-    swc1     f3, 8(a2)
+    swc1     f2, 0(a2)
+    swc1     f4, 4(a2)
+    swc1     f6, 8(a2)
     addu     t0, t0, a1
-    swc1     f4, 12(a2)
-    swc1     f5, 16(a2)
-    swc1     f6, 20(a2)
-    swc1     f7, 24(a2)
-    swc1     f8, 28(a2)
+    swc1     f8, 12(a2)
+    swc1     f10, 16(a2)
+    swc1     f12, 20(a2)
+    swc1     f14, 24(a2)
+    swc1     f16, 28(a2)
     //elemr 1
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4194,32 +4195,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 8(a0)
-    swc1     f1, 32(a2)
-    swc1     f2, 36(a2)
-    swc1     f3, 40(a2)
+    swc1     f2, 32(a2)
+    swc1     f4, 36(a2)
+    swc1     f6, 40(a2)
     addu     t0, t0, a1
-    swc1     f4, 44(a2)
-    swc1     f5, 48(a2)
-    swc1     f6, 52(a2)
-    swc1     f7, 56(a2)
-    swc1     f8, 60(a2)
+    swc1     f8, 44(a2)
+    swc1     f10, 48(a2)
+    swc1     f12, 52(a2)
+    swc1     f14, 56(a2)
+    swc1     f16, 60(a2)
     //elemr 2
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4237,32 +4238,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 12(a0)
-    swc1     f1, 64(a2)
-    swc1     f2, 68(a2)
-    swc1     f3, 72(a2)
+    swc1     f2, 64(a2)
+    swc1     f4, 68(a2)
+    swc1     f6, 72(a2)
     addu     t0, t0, a1
-    swc1     f4, 76(a2)
-    swc1     f5, 80(a2)
-    swc1     f6, 84(a2)
-    swc1     f7, 88(a2)
-    swc1     f8, 92(a2)
+    swc1     f8, 76(a2)
+    swc1     f10, 80(a2)
+    swc1     f12, 84(a2)
+    swc1     f14, 88(a2)
+    swc1     f16, 92(a2)
     //elemr 3
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4280,32 +4281,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 16(a0)
-    swc1     f1, 96(a2)
-    swc1     f2, 100(a2)
-    swc1     f3, 104(a2)
+    swc1     f2, 96(a2)
+    swc1     f4, 100(a2)
+    swc1     f6, 104(a2)
     addu     t0, t0, a1
-    swc1     f4, 108(a2)
-    swc1     f5, 112(a2)
-    swc1     f6, 116(a2)
-    swc1     f7, 120(a2)
-    swc1     f8, 124(a2)
+    swc1     f8, 108(a2)
+    swc1     f10, 112(a2)
+    swc1     f12, 116(a2)
+    swc1     f14, 120(a2)
+    swc1     f16, 124(a2)
     //elemr 4
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4323,32 +4324,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 20(a0)
-    swc1     f1, 128(a2)
-    swc1     f2, 132(a2)
-    swc1     f3, 136(a2)
+    swc1     f2, 128(a2)
+    swc1     f4, 132(a2)
+    swc1     f6, 136(a2)
     addu     t0, t0, a1
-    swc1     f4, 140(a2)
-    swc1     f5, 144(a2)
-    swc1     f6, 148(a2)
-    swc1     f7, 152(a2)
-    swc1     f8, 156(a2)
+    swc1     f8, 140(a2)
+    swc1     f10, 144(a2)
+    swc1     f12, 148(a2)
+    swc1     f14, 152(a2)
+    swc1     f16, 156(a2)
     //elemr 5
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4366,32 +4367,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 24(a0)
-    swc1     f1, 160(a2)
-    swc1     f2, 164(a2)
-    swc1     f3, 168(a2)
+    swc1     f2, 160(a2)
+    swc1     f4, 164(a2)
+    swc1     f6, 168(a2)
     addu     t0, t0, a1
-    swc1     f4, 172(a2)
-    swc1     f5, 176(a2)
-    swc1     f6, 180(a2)
-    swc1     f7, 184(a2)
-    swc1     f8, 188(a2)
+    swc1     f8, 172(a2)
+    swc1     f10, 176(a2)
+    swc1     f12, 180(a2)
+    swc1     f14, 184(a2)
+    swc1     f16, 188(a2)
     //elemr 6
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4409,32 +4410,32 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
     lw       t0, 28(a0)
-    swc1     f1, 192(a2)
-    swc1     f2, 196(a2)
-    swc1     f3, 200(a2)
+    swc1     f2, 192(a2)
+    swc1     f4, 196(a2)
+    swc1     f6, 200(a2)
     addu     t0, t0, a1
-    swc1     f4, 204(a2)
-    swc1     f5, 208(a2)
-    swc1     f6, 212(a2)
-    swc1     f7, 216(a2)
-    swc1     f8, 220(a2)
+    swc1     f8, 204(a2)
+    swc1     f10, 208(a2)
+    swc1     f12, 212(a2)
+    swc1     f14, 216(a2)
+    swc1     f16, 220(a2)
     //elemr 7
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
@@ -4452,30 +4453,30 @@
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
-    mtc1     t1, f1
-    mtc1     t2, f2
-    mtc1     t3, f3
-    mtc1     t4, f4
-    mtc1     t5, f5
-    mtc1     t6, f6
-    mtc1     t7, f7
-    mtc1     t8, f8
-    cvt.s.w  f1, f1
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
     cvt.s.w  f2, f2
-    cvt.s.w  f3, f3
     cvt.s.w  f4, f4
-    cvt.s.w  f5, f5
     cvt.s.w  f6, f6
-    cvt.s.w  f7, f7
     cvt.s.w  f8, f8
-    swc1     f1, 224(a2)
-    swc1     f2, 228(a2)
-    swc1     f3, 232(a2)
-    swc1     f4, 236(a2)
-    swc1     f5, 240(a2)
-    swc1     f6, 244(a2)
-    swc1     f7, 248(a2)
-    swc1     f8, 252(a2)
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    swc1     f2, 224(a2)
+    swc1     f4, 228(a2)
+    swc1     f6, 232(a2)
+    swc1     f8, 236(a2)
+    swc1     f10, 240(a2)
+    swc1     f12, 244(a2)
+    swc1     f14, 248(a2)
+    swc1     f16, 252(a2)
 
     j        ra
      nop
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
new file mode 100644
index 0000000..afbaa82
--- /dev/null
+++ b/simd/jsimd_powerpc.c
@@ -0,0 +1,741 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2014-2015 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+static unsigned int simd_support = ~0;
+
+LOCAL(void)
+init_simd (void)
+{
+  const char *force_none;
+
+  /* ~0U is the initial value of simd_support; anything else means done. */
+  if (simd_support != ~0U)
+    return;
+  /* AltiVec is selected unless JSIMD_FORCENONE is set to exactly "1". */
+  force_none = getenv("JSIMD_FORCENONE");
+  if (force_none != NULL && strcmp(force_none, "1") == 0)
+    simd_support = 0;
+  else
+    simd_support = JSIMD_ALTIVEC;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* This path is offered only for 8-bit samples, a 4-byte JDIMENSION,
+   * and RGB pixels that are 3 or 4 bytes wide.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Configuration is acceptable; report whether AltiVec is enabled. */
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  /* Refuse unless samples are 8 bits, JDIMENSION is 4 bytes wide, and
+   * RGB pixels occupy 3 or 4 bytes.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Build-time checks passed; the answer now depends on simd_support. */
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* Only 8-bit samples, a 4-byte JDIMENSION, and 3- or 4-byte RGB
+   * pixels are accepted; anything else yields 0.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Return 1 only when the AltiVec bit is set in simd_support. */
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+  return 0;  /* always 0: RGB565 output is never reported as accelerated */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*convert_fn)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  /* Select the AltiVec routine that matches cinfo->in_color_space. */
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      convert_fn = jsimd_extrgb_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      convert_fn = jsimd_extbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      convert_fn = jsimd_extrgbx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      convert_fn = jsimd_extbgrx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      convert_fn = jsimd_extxrgb_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      convert_fn = jsimd_extxbgr_ycc_convert_altivec;
+      break;
+    default:
+      convert_fn = jsimd_rgb_ycc_convert_altivec;
+      break;
+  }
+  convert_fn(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*gray_fn)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  /* Select the AltiVec routine that matches cinfo->in_color_space. */
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      gray_fn = jsimd_extrgb_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      gray_fn = jsimd_extbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      gray_fn = jsimd_extrgbx_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      gray_fn = jsimd_extbgrx_gray_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      gray_fn = jsimd_extxrgb_gray_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      gray_fn = jsimd_extxbgr_gray_convert_altivec;
+      break;
+    default:
+      gray_fn = jsimd_rgb_gray_convert_altivec;
+      break;
+  }
+  gray_fn(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)  /* Picks a jsimd_ycc_*_convert_altivec routine from cinfo->out_color_space and calls it. */
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);  /* routine selected below */
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_ycc_extrgb_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* RGBX and RGBA share one routine */
+      altivecfct=jsimd_ycc_extrgbx_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_ycc_extbgr_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* BGRX and BGRA share one routine */
+      altivecfct=jsimd_ycc_extbgrx_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* XBGR and ABGR share one routine */
+      altivecfct=jsimd_ycc_extxbgr_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* XRGB and ARGB share one routine */
+      altivecfct=jsimd_ycc_extxrgb_convert_altivec;
+      break;
+    default:  /* any colour space not listed above uses the non-"ext" routine */
+      altivecfct=jsimd_ycc_rgb_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);  /* width from cinfo; other args passed through */
+}
+
+GLOBAL(void)  /* Empty stub: performs no conversion and ignores every argument. */
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{  /* NOTE(review): presumably never called, since jsimd_can_ycc_rgb565() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v2_downsample_altivec() using fields read from cinfo and compptr. */
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data, output_data);  /* both buffers are passed through as given */
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v1_downsample_altivec() using fields read from cinfo and compptr. */
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data, output_data);  /* both buffers are passed through as given */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v2_upsample_altivec(); compptr is not read here. */
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);  /* buffers are passed through as given */
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v1_upsample_altivec(); compptr is not read here. */
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);  /* buffers are passed through as given */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v2_fancy_upsample_altivec() with values read from cinfo and compptr. */
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,  /* width comes from the component, not cinfo */
+                                    output_data_ptr);
+}
+
+GLOBAL(void)  /* Forwards to jsimd_h2v1_fancy_upsample_altivec() with values read from cinfo and compptr. */
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,  /* width comes from the component, not cinfo */
+                                    output_data_ptr);
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Picks a jsimd_h2v2_*_merged_upsample_altivec routine from cinfo->out_color_space and calls it. */
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);  /* routine selected below */
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* RGBX and RGBA share one routine */
+      altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* BGRX and BGRA share one routine */
+      altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* XBGR and ABGR share one routine */
+      altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* XRGB and ARGB share one routine */
+      altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
+      break;
+    default:  /* any colour space not listed above uses the non-"ext" routine */
+      altivecfct=jsimd_h2v2_merged_upsample_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);  /* width from cinfo; other args passed through */
+}
+
+GLOBAL(void)  /* Picks a jsimd_h2v1_*_merged_upsample_altivec routine from cinfo->out_color_space and calls it. */
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);  /* routine selected below */
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* RGBX and RGBA share one routine */
+      altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* BGRX and BGRA share one routine */
+      altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* XBGR and ABGR share one routine */
+      altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* XRGB and ARGB share one routine */
+      altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
+      break;
+    default:  /* any colour space not listed above uses the non-"ext" routine */
+      altivecfct=jsimd_h2v1_merged_upsample_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);  /* width from cinfo; other args passed through */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_convsamp (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* Thin wrapper: passes all three arguments unchanged to jsimd_convsamp_altivec(). */
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores every argument. */
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT *workspace)
+{  /* NOTE(review): presumably never called, since jsimd_can_convsamp_float() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_fdct_islow (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* Thin wrapper: passes data unchanged to jsimd_fdct_islow_altivec(). */
+jsimd_fdct_islow (DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)  /* Thin wrapper: passes data unchanged to jsimd_fdct_ifast_altivec(). */
+jsimd_fdct_ifast (DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores its argument. */
+jsimd_fdct_float (FAST_FLOAT *data)
+{  /* NOTE(review): presumably never called, since jsimd_can_fdct_float() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_quantize (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* Thin wrapper: passes all three arguments unchanged to jsimd_quantize_altivec(). */
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores every argument. */
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
+{  /* NOTE(review): presumably never called, since jsimd_can_quantize_float() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores every argument. */
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* NOTE(review): presumably never called, since jsimd_can_idct_2x2() returns 0 -- confirm */
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores every argument. */
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* NOTE(review): presumably never called, since jsimd_can_idct_4x4() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_idct_islow (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 only if the checks below pass and simd_support has JSIMD_ALTIVEC; else 0. */
+jsimd_can_idct_ifast (void)
+{
+  init_simd();  /* defined outside this view; presumably populates simd_support -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* the AltiVec bit is the only capability tested */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* Forwards to jsimd_idct_islow_altivec(); cinfo is not read here. */
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,  /* table taken from the component */
+                           output_col);
+}
+
+GLOBAL(void)  /* Forwards to jsimd_idct_ifast_altivec(); cinfo is not read here. */
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,  /* table taken from the component */
+                           output_col);
+}
+
+GLOBAL(void)  /* Empty stub: does nothing and ignores every argument. */
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{  /* NOTE(review): presumably never called, since jsimd_can_idct_float() returns 0 -- confirm */
+}
+
+GLOBAL(int)  /* Unconditionally returns 0; no build-time or runtime checks are made. */
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET*)  /* Stub: ignores every argument and always returns NULL. */
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;  /* NOTE(review): presumably unreachable, since jsimd_can_huff_encode_one_block() returns 0 -- confirm */
+}
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 2c47a7f..fa33bea 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -2,7 +2,8 @@
  * jsimd_x86_64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2014 D. R. Commander
+ * Copyright 2009-2011, 2014, 2016 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -29,9 +30,38 @@
 
 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 
+static unsigned int simd_support = ~0;  /* all-ones means "init_simd() has not run yet" */
+static unsigned int simd_huffman = 1;  /* cleared by init_simd() when JSIMD_NOHUFFENC is "1" */
+
+/*
+ * Check what SIMD accelerations are supported (later calls return early).
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)  /* already initialised by an earlier call */
+    return;
+
+  simd_support = JSIMD_SSE2 | JSIMD_SSE;  /* both flags are set unconditionally; no CPU probe is done here */
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))  /* only the exact string "1" counts */
+    simd_support = 0;  /* clears every capability bit */
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))  /* only the exact string "1" counts */
+    simd_huffman = 0;  /* read by jsimd_can_huff_encode_one_block() */
+}
+
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -40,15 +70,18 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_rgb_gray (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -57,15 +90,18 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_ycc_rgb (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -74,10 +110,11 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
@@ -210,29 +247,39 @@
 GLOBAL(int)
 jsimd_can_h2v2_downsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_downsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
@@ -241,7 +288,7 @@
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
@@ -252,32 +299,42 @@
 GLOBAL(int)
 jsimd_can_h2v2_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
                            input_data, output_data_ptr);
@@ -285,9 +342,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
                            input_data, output_data_ptr);
@@ -296,38 +353,44 @@
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
                                  compptr->downsampled_width, input_data,
@@ -336,9 +399,9 @@
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
                                  compptr->downsampled_width, input_data,
@@ -348,31 +411,37 @@
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
@@ -456,6 +525,8 @@
 GLOBAL(int)
 jsimd_can_convsamp (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -466,12 +537,17 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_convsamp_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -482,19 +558,22 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
   jsimd_convsamp_sse2(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
   jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
 }
@@ -502,62 +581,68 @@
 GLOBAL(int)
 jsimd_can_fdct_islow (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_fdct_ifast (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_fdct_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 0;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
   jsimd_fdct_islow_sse2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
   jsimd_fdct_ifast_sse2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
   jsimd_fdct_float_sse(data);
 }
@@ -565,6 +650,8 @@
 GLOBAL(int)
 jsimd_can_quantize (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -573,12 +660,17 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_quantize_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -587,19 +679,22 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
   jsimd_quantize_sse2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
   jsimd_quantize_float_sse2(coef_block, divisors, workspace);
 }
@@ -607,6 +702,8 @@
 GLOBAL(int)
 jsimd_can_idct_2x2 (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -619,15 +716,17 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_4x4 (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -640,14 +739,14 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -655,7 +754,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -665,6 +764,8 @@
 GLOBAL(int)
 jsimd_can_idct_islow (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -677,15 +778,17 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_ifast (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -700,15 +803,17 @@
   if (IFAST_SCALE_BITS != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_float (void)
 {
+  init_simd();
+
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(JCOEF) != 2)
@@ -722,14 +827,14 @@
   if (sizeof(FLOAT_MULT_TYPE) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -738,7 +843,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -747,7 +852,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
@@ -755,3 +860,28 @@
                         output_col);
 }
 
+GLOBAL(int)  /* Returns 1 only if every check below passes; otherwise 0. */
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();  /* sets simd_support and simd_huffman on first use */
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&  /* SSE2 bit set and JSIMD_NOHUFFENC not in effect */
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))  /* 16-byte alignment check, per the IS_ALIGNED_SSE definition */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET*)  /* Thin wrapper: returns whatever jsimd_huff_encode_one_block_sse2() returns. */
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,  /* all six arguments passed through */
+                                          dctbl, actbl);
+}
diff --git a/structure.txt b/structure.txt
index 4c9fe39..296d125 100644
--- a/structure.txt
+++ b/structure.txt
@@ -4,7 +4,7 @@
 Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
 It was modified by The libjpeg-turbo Project to include only information
 relevant to libjpeg-turbo.
-For conditions of distribution and use, see the accompanying README file.
+For conditions of distribution and use, see the accompanying README.ijg file.
 
 
 This file provides an overview of the architecture of the IJG JPEG software;
@@ -13,7 +13,7 @@
 convention, see the include files and comments in the source code.
 
 We assume that the reader is already somewhat familiar with the JPEG standard.
-The README file includes references for learning about JPEG.  The file
+The README.ijg file includes references for learning about JPEG.  The file
 libjpeg.txt describes the library from the viewpoint of an application
 programmer using the library; it's best to read that file before this one.
 Also, the file coderules.txt describes the coding style conventions we use.
diff --git a/tjbench.c b/tjbench.c
index 29ed3d0..81b36f6 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,7 +46,7 @@
 #define _throwbmp(m) _throw(m, bmpgeterr())
 
 int flags=TJFLAG_NOREALLOC, componly=0, decomponly=0, doyuv=0, quiet=0,
-	dotile=0, pf=TJPF_BGR, yuvpad=1, warmup=1;
+	dotile=0, pf=TJPF_BGR, yuvpad=1, warmup=1, dowrite=1;
 char *ext="ppm";
 const char *pixFormatStr[TJ_NUMPF]=
 {
@@ -213,6 +213,9 @@
 				(double)(w*h)/1000000.*(double)iter/elapsedDecode);
 		}
 	}
+
+	if (!dowrite) goto bailout;
+
 	if(sf.num!=1 || sf.denom!=1)
 		snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
 	else if(tilew!=w || tileh!=h)
@@ -422,7 +425,7 @@
 			printf("                  Output bit stream:  %f Megabits/sec\n",
 				(double)totaljpegsize*8./1000000.*(double)iter/elapsed);
 		}
-		if(tilew==w && tileh==h)
+		if(tilew==w && tileh==h && dowrite)
 		{
 			snprintf(tempstr, 1024, "%s_%s_Q%d.jpg", filename, subName[subsamp],
 				jpegqual);
@@ -756,7 +759,9 @@
 	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
 	printf("-warmup <w> = Execute each benchmark <w> times to prime the cache before\n");
 	printf("     taking performance measurements (default = 1)\n");
-	printf("-componly = Stop after running compression tests.  Do not test decompression.\n\n");
+	printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
+	printf("-nowrite = Do not write reference or output images (improves consistency of\n");
+	printf("     performance measurements.)\n\n");
 	printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
 	printf("test will be performed for all quality values in the range.\n\n");
 	exit(1);
@@ -906,6 +911,7 @@
 				}
 			}
 			if(!strcasecmp(argv[i], "-componly")) componly=1;
+			if(!strcasecmp(argv[i], "-nowrite")) dowrite=0;
 		}
 	}
 
diff --git a/transupp.c b/transupp.c
index 93444e3..d1c56c6 100644
--- a/transupp.c
+++ b/transupp.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains image transformation routines and other utility code
  * used by the jpegtran sample application.  These are NOT part of the core
@@ -763,9 +764,9 @@
  */
 
 LOCAL(boolean)
-jt_read_integer (const char ** strptr, JDIMENSION * result)
+jt_read_integer (const char **strptr, JDIMENSION *result)
 {
-  const char * ptr = *strptr;
+  const char *ptr = *strptr;
   JDIMENSION val = 0;
 
   for (; isdigit(*ptr); ptr++) {
@@ -1178,7 +1179,7 @@
 
 #if JPEG_LIB_VERSION >= 70
 LOCAL(void)
-adjust_exif_parameters (JOCTET * data, unsigned int length,
+adjust_exif_parameters (JOCTET *data, unsigned int length,
                         JDIMENSION new_width, JDIMENSION new_height)
 {
   boolean is_motorola; /* Flag for byte order */
diff --git a/transupp.h b/transupp.h
index 8fe9071..bf3118a 100644
--- a/transupp.h
+++ b/transupp.h
@@ -5,7 +5,8 @@
  * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code relevant
  * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for image transformation routines and
  * other utility code used by the jpegtran sample application.  These are
@@ -142,7 +143,7 @@
 
   /* Internal workspace: caller should not touch these */
   int num_components;           /* # of components in workspace */
-  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
+  jvirt_barray_ptr *workspace_coef_arrays; /* workspace for transformations */
   JDIMENSION output_width;      /* cropped destination dimensions */
   JDIMENSION output_height;
   JDIMENSION x_crop_offset;     /* destination crop offsets measured in iMCUs */
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index 1bf478f..eaba670 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -39,23 +39,27 @@
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 
-#define _throw(msg) {  \
-	jclass _exccls=(*env)->FindClass(env, "java/lang/Exception");  \
-	if(!_exccls) goto bailout;  \
+#define _throw(msg, exceptionClass) {  \
+	jclass _exccls=(*env)->FindClass(env, exceptionClass);  \
+	if(!_exccls || (*env)->ExceptionCheck(env)) goto bailout;  \
 	(*env)->ThrowNew(env, _exccls, msg);  \
 	goto bailout;  \
 }
 
-#define bailif0(f) {if(!(f)) {  \
-	char temps[80];  \
-	snprintf(temps, 80, "Unexpected NULL condition in line %d", __LINE__);  \
-	_throw(temps);  \
+#define _throwtj() _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException")
+
+#define _throwarg(msg) _throw(msg, "java/lang/IllegalArgumentException")
+
+#define _throwmem() _throw("Memory allocation failure", "java/lang/OutOfMemoryError")
+
+#define bailif0(f) {if(!(f) || (*env)->ExceptionCheck(env)) {  \
+	goto bailout;  \
 }}
 
 #define gethandle()  \
 	jclass _cls=(*env)->GetObjectClass(env, obj);  \
 	jfieldID _fid;  \
-	if(!_cls) goto bailout;  \
+	if(!_cls || (*env)->ExceptionCheck(env)) goto bailout;  \
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "handle", "J"));  \
 	handle=(tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);  \
 
@@ -101,7 +105,7 @@
 	(JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
 {
 	jint retval=(jint)tjBufSize(width, height, jpegSubsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
@@ -112,7 +116,7 @@
 	(JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
 {
 	jint retval=(jint)tjBufSizeYUV2(width, pad, height, subsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
@@ -133,7 +137,7 @@
 {
 	jint retval=(jint)tjPlaneSizeYUV(componentID, width, stride, height,
 		subsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
@@ -144,7 +148,7 @@
 	(JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
 {
 	jint retval=(jint)tjPlaneWidth(componentID, width, subsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
@@ -155,7 +159,7 @@
 	(JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
 {
 	jint retval=(jint)tjPlaneHeight(componentID, height, subsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
@@ -170,7 +174,7 @@
 	tjhandle handle;
 
 	if((handle=tjInitCompress())==NULL)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
@@ -194,17 +198,17 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
 		|| pitch<0)
-		_throw("Invalid argument in compress()");
+		_throwarg("Invalid argument in compress()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
 	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
 	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	jpegSize=tjBufSize(width, height, jpegSubsamp);
 	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
@@ -214,7 +218,7 @@
 	if(tjCompress2(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]], width,
 		pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp, jpegQual,
 		flags|TJFLAG_NOREALLOC)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
@@ -249,9 +253,9 @@
 		jint jpegQual, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in compress()");
+		_throwarg("Invalid argument in compress()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when compressing from an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
 
 	return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
 		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
@@ -267,9 +271,9 @@
 		jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in compress()");
+		_throwarg("Invalid argument in compress()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when compressing from an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
 
 	return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
 		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
@@ -287,27 +291,28 @@
 	tjhandle handle=0;
 	unsigned long jpegSize=0;
 	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
-	unsigned char *srcPlanes[3], *jpegBuf=NULL;
+	const unsigned char *srcPlanes[3]={NULL, NULL, NULL};
+	unsigned char *jpegBuf=NULL;
 	int *srcOffsets=NULL, *srcStrides=NULL;
 	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
 
 	gethandle();
 
 	if(subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throw("Invalid argument in compressFromYUV()");
+		_throwarg("Invalid argument in compressFromYUV()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	if((*env)->GetArrayLength(env, srcobjs)<nc)
-		_throw("Planes array is too small for the subsampling type");
+		_throwarg("Planes array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
-		_throw("Offsets array is too small for the subsampling type");
+		_throwarg("Offsets array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
-		_throw("Strides array is too small for the subsampling type");
+		_throwarg("Strides array is too small for the subsampling type");
 
 	jpegSize=tjBufSize(width, height, subsamp);
 	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
 	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
@@ -317,16 +322,16 @@
 		int pw=tjPlaneWidth(i, width, subsamp);
 
 		if(planeSize<0 || pw<0)
-			_throw(tjGetErrorStr());
+			_throwarg(tjGetErrorStr());
 
 		if(srcOffsets[i]<0)
-			_throw("Invalid argument in compressFromYUV()");
+			_throwarg("Invalid argument in compressFromYUV()");
 		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
-			_throw("Negative plane stride would cause memory to be accessed below plane boundary");
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
 		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
 		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
-			_throw("Source plane is not large enough");
+			_throwarg("Source plane is not large enough");
 
 		bailif0(srcPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
 			0));
@@ -338,15 +343,15 @@
 
 	if(tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
 		subsamp, &jpegBuf, &jpegSize, jpegQual, flags|TJFLAG_NOREALLOC)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
 	for(i=0; i<nc; i++)
 	{
 		if(srcPlanes[i] && jSrcPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i], srcPlanes[i],
-				0);
+			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+				(unsigned char *)srcPlanes[i], 0);
 	}
 	if(srcStrides)
 		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
@@ -371,22 +376,22 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
 		|| pitch<0 || subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throw("Invalid argument in encodeYUV()");
+		_throwarg("Invalid argument in encodeYUV()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
 		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	if((*env)->GetArrayLength(env, dstobjs)<nc)
-		_throw("Planes array is too small for the subsampling type");
+		_throwarg("Planes array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jDstOffsets)<nc)
-		_throw("Offsets array is too small for the subsampling type");
+		_throwarg("Offsets array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jDstStrides)<nc)
-		_throw("Strides array is too small for the subsampling type");
+		_throwarg("Strides array is too small for the subsampling type");
 
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
 	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
 	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 
 	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
 	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
@@ -396,16 +401,16 @@
 		int pw=tjPlaneWidth(i, width, subsamp);
 
 		if(planeSize<0 || pw<0)
-			_throw(tjGetErrorStr());
+			_throwarg(tjGetErrorStr());
 
 		if(dstOffsets[i]<0)
-			_throw("Invalid argument in encodeYUV()");
+			_throwarg("Invalid argument in encodeYUV()");
 		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
-			_throw("Negative plane stride would cause memory to be accessed below plane boundary");
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
 		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
 		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
-			_throw("Destination plane is not large enough");
+			_throwarg("Destination plane is not large enough");
 
 		bailif0(dstPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
 			0));
@@ -415,7 +420,7 @@
 
 	if(tjEncodeYUVPlanes(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]],
 		width, pitch, height, pf, dstPlanes, dstStrides, subsamp, flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
@@ -445,13 +450,13 @@
 /* TurboJPEG 1.4.x: TJCompressor::encodeYUV() int source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III
 	(JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
-		jint stride, jint height, jint pf, jobjectArray dstobjs, 
+		jint stride, jint height, jint pf, jobjectArray dstobjs,
 		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in encodeYUV()");
+		_throwarg("Invalid argument in encodeYUV()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when encoding from an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
 
 	TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
 		stride*sizeof(jint), height, pf, dstobjs, jDstOffsets, jDstStrides,
@@ -473,23 +478,23 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
 		|| pitch<0)
-		_throw("Invalid argument in encodeYUV()");
+		_throwarg("Invalid argument in encodeYUV()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
 	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	if((*env)->GetArrayLength(env, dst)
 		<(jsize)tjBufSizeYUV(width, height, subsamp))
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
 	if(tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
 		flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -512,9 +517,9 @@
 		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in encodeYUV()");
+		_throwarg("Invalid argument in encodeYUV()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when encoding from an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
 
 	TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
 		stride*sizeof(jint), height, pf, dst, subsamp, flags);
@@ -531,7 +536,7 @@
 
 	gethandle();
 
-	if(tjDestroy(handle)==-1) _throw(tjGetErrorStr());
+	if(tjDestroy(handle)==-1) _throwtj();
 	(*env)->SetLongField(env, obj, _fid, 0);
 
 	bailout:
@@ -546,7 +551,7 @@
 	jfieldID fid;
 	tjhandle handle;
 
-	if((handle=tjInitDecompress())==NULL) _throw(tjGetErrorStr());
+	if((handle=tjInitDecompress())==NULL) _throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
@@ -566,7 +571,7 @@
 	jobjectArray sfjava=NULL;
 
 	if((sf=tjGetScalingFactors(&n))==NULL || n==0)
-		_throw(tjGetErrorStr());
+		_throwarg(tjGetErrorStr());
 
 	bailif0(sfcls=(*env)->FindClass(env, "org/libjpegturbo/turbojpeg/TJScalingFactor"));
 	bailif0(sfjava=(jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
@@ -596,13 +601,13 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 
 	if(tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize,
 		&width, &height, &jpegSubsamp, &jpegColorspace)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);  jpegBuf=NULL;
 
@@ -634,16 +639,16 @@
 	gethandle();
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
+		_throwarg("Invalid argument in decompress()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
 	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
 	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
@@ -651,7 +656,7 @@
 	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
 		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
 		flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -683,9 +688,9 @@
 		jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
+		_throwarg("Invalid argument in decompress()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when decompressing to an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
 	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
 		width, stride*sizeof(jint), height, pf, flags);
@@ -700,9 +705,9 @@
 		jint width, jint stride, jint height, jint pf, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
+		_throwarg("Invalid argument in decompress()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when decompressing to an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
 	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
 		width, stride*sizeof(jint), height, pf, flags);
@@ -730,7 +735,7 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
@@ -745,7 +750,7 @@
 	if(height==0) height=jpegHeight;
 	sf=tjGetScalingFactors(&nsf);
 	if(!sf || nsf<1)
-		_throw(tjGetErrorStr());
+		_throwarg(tjGetErrorStr());
 	for(i=0; i<nsf; i++)
 	{
 		scaledWidth=TJSCALED(jpegWidth, sf[i]);
@@ -753,6 +758,8 @@
 		if(scaledWidth<=width && scaledHeight<=height)
 			break;
 	}
+	if(i>=nsf)
+		_throwarg("Could not scale down to desired image dimensions");
 
 	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
 	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
@@ -763,16 +770,16 @@
 		int pw=tjPlaneWidth(i, scaledWidth, jpegSubsamp);
 
 		if(planeSize<0 || pw<0)
-			_throw(tjGetErrorStr());
+			_throwarg(tjGetErrorStr());
 
 		if(dstOffsets[i]<0)
-			_throw("Invalid argument in decompressToYUV()");
+			_throwarg("Invalid argument in decompressToYUV()");
 		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
-			_throw("Negative plane stride would cause memory to be accessed below plane boundary");
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
 		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
 		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
-			_throw("Destination plane is not large enough");
+			_throwarg("Destination plane is not large enough");
 
 		bailif0(dstPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
 			0));
@@ -782,7 +789,7 @@
 
 	if(tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
 		dstPlanes, desiredWidth, dstStrides, desiredHeight, flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
@@ -811,7 +818,7 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
@@ -820,14 +827,14 @@
 	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
 	if((*env)->GetArrayLength(env, dst)
 		<(jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
 	if(tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
 		flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -843,7 +850,8 @@
 	tjhandle handle=0;
 	jsize arraySize=0, actualPitch;
 	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
-	unsigned char *srcPlanes[3], *dstBuf=NULL;
+	const unsigned char *srcPlanes[3]={NULL, NULL, NULL};
+	unsigned char *dstBuf=NULL;
 	int *srcOffsets=NULL, *srcStrides=NULL;
 	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
 
@@ -851,22 +859,22 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp<0
 		|| subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throw("Invalid argument in decodeYUV()");
+		_throwarg("Invalid argument in decodeYUV()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
 		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	if((*env)->GetArrayLength(env, srcobjs)<nc)
-		_throw("Planes array is too small for the subsampling type");
+		_throwarg("Planes array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
-		_throw("Offsets array is too small for the subsampling type");
+		_throwarg("Offsets array is too small for the subsampling type");
 	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
-		_throw("Strides array is too small for the subsampling type");
+		_throwarg("Strides array is too small for the subsampling type");
 
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
 	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
 	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
 	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
@@ -876,16 +884,16 @@
 		int pw=tjPlaneWidth(i, width, subsamp);
 
 		if(planeSize<0 || pw<0)
-			_throw(tjGetErrorStr());
+			_throwarg(tjGetErrorStr());
 
 		if(srcOffsets[i]<0)
-			_throw("Invalid argument in decodeYUV()");
+			_throwarg("Invalid argument in decodeYUV()");
 		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
-			_throw("Negative plane stride would cause memory to be accessed below plane boundary");
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
 		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
 		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
-			_throw("Source plane is not large enough");
+			_throwarg("Source plane is not large enough");
 
 		bailif0(srcPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
 			0));
@@ -896,15 +904,15 @@
 	if(tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
 		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
 		flags)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 	for(i=0; i<nc; i++)
 	{
 		if(srcPlanes[i] && jSrcPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i], srcPlanes[i],
-				0);
+			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+				(unsigned char *)srcPlanes[i], 0);
 	}
 	if(srcStrides)
 		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
@@ -930,9 +938,9 @@
 		jint width, jint stride, jint height, jint pf, jint flags)
 {
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decodeYUV()");
+		_throwarg("Invalid argument in decodeYUV()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when decoding to an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
 
 	TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
 		subsamp, dst, sizeof(jint), x, y, width, stride*sizeof(jint), height, pf,
@@ -950,7 +958,7 @@
 	jfieldID fid;
 	tjhandle handle;
 
-	if((handle=tjInitTransform())==NULL) _throw(tjGetErrorStr());
+	if((handle=tjInitTransform())==NULL) _throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
@@ -1040,7 +1048,7 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, jsrcBuf)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
 	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
@@ -1050,19 +1058,19 @@
 
 	n=(*env)->GetArrayLength(env, dstobjs);
 	if(n!=(*env)->GetArrayLength(env, tobjs))
-		_throw("Mismatch between size of transforms array and destination buffers array");
+		_throwarg("Mismatch between size of transforms array and destination buffers array");
 
 	if((dstBufs=(unsigned char **)malloc(sizeof(unsigned char *)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((jdstBufs=(jbyteArray *)malloc(sizeof(jbyteArray)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((dstSizes=(unsigned long *)malloc(sizeof(unsigned long)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((t=(tjtransform *)malloc(sizeof(tjtransform)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((params=(JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams)*n))
 		==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	for(i=0; i<n; i++)
 	{
 		dstBufs[i]=NULL;  jdstBufs[i]=NULL;  dstSizes[i]=0;
@@ -1110,7 +1118,7 @@
 		bailif0(jdstBufs[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
 		if((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i])
 			<tjBufSize(w, h, jpegSubsamp))
-			_throw("Destination buffer is not large enough");
+			_throwarg("Destination buffer is not large enough");
 	}
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
 	for(i=0; i<n; i++)
@@ -1118,7 +1126,7 @@
 
 	if(tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
 		flags|TJFLAG_NOREALLOC)==-1)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	for(i=0; i<n; i++)
 	{
diff --git a/turbojpeg.c b/turbojpeg.c
index d0e9704..421b5f8 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -44,7 +44,8 @@
 
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **,
 	unsigned long *, boolean);
-extern void jpeg_mem_src_tj(j_decompress_ptr, unsigned char *, unsigned long);
+extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *,
+	unsigned long);
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 #define isPow2(x) (((x)&(x-1))==0)
@@ -729,7 +730,7 @@
 }
 
 
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
 	unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)
 {
@@ -786,8 +787,9 @@
 		_throw("tjCompress2(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
-		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=&srcBuf[i*pitch];
+		if(flags&TJFLAG_BOTTOMUP)
+			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
 	}
 	while(cinfo->next_scanline<cinfo->image_height)
 	{
@@ -828,9 +830,10 @@
 }
 
 
-DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelFormat, unsigned char **dstPlanes,
-	int *strides, int subsamp, int flags)
+DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pitch, int height,
+	int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
+	int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
@@ -911,8 +914,9 @@
 		_throw("tjEncodeYUVPlanes(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
-		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=&srcBuf[i*pitch];
+		if(flags&TJFLAG_BOTTOMUP)
+			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
 	}
 	if(height<ph0)
 		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
@@ -989,9 +993,9 @@
 	return retval;
 }
 
-DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
-	int pad, int subsamp, int flags)
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pitch, int height,
+	int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)
 {
 	unsigned char *dstPlanes[3];
 	int pw0, ph0, strides[3], retval=-1;
@@ -1043,8 +1047,9 @@
 
 
 DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
-	unsigned char **srcPlanes, int width, int *strides, int height, int subsamp,
-	unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)
+	const unsigned char **srcPlanes, int width, const int *strides, int height,
+	int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
+	int flags)
 {
 	int i, row, retval=0, alloc=1;  JSAMPROW *inbuf[MAX_COMPONENTS];
 	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
@@ -1107,7 +1112,7 @@
 		tmpbufsize+=iw[i]*th[i];
 		if((inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
 			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-		ptr=srcPlanes[i];
+		ptr=(JSAMPLE *)srcPlanes[i];
 		for(row=0; row<ph[i]; row++)
 		{
 			inbuf[i][row]=ptr;
@@ -1173,11 +1178,11 @@
 	return retval;
 }
 
-DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle, unsigned char *srcBuf,
-	int width, int pad, int height, int subsamp, unsigned char **jpegBuf,
-	unsigned long *jpegSize, int jpegQual, int flags)
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
+	unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)
 {
-	unsigned char *srcPlanes[3];
+	const unsigned char *srcPlanes[3];
 	int pw0, ph0, strides[3], retval=-1;
 
 	if(srcBuf==NULL || width<=0 || pad<1 || height<=0 || subsamp<0
@@ -1252,8 +1257,8 @@
 
 
 DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-	int *jpegSubsamp, int *jpegColorspace)
+	const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
+	int *height, int *jpegSubsamp, int *jpegColorspace)
 {
 	int retval=0;
 
@@ -1333,9 +1338,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch,
-	int height, int pixelFormat, int flags)
+DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int width, int pitch, int height, int pixelFormat, int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	int jpegwidth, jpegheight, scaledw, scaledh;
@@ -1501,8 +1506,9 @@
 }
 
 DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
-	unsigned char **srcPlanes, int *strides, int subsamp, unsigned char *dstBuf,
-	int width, int pitch, int height, int pixelFormat, int flags)
+	const unsigned char **srcPlanes, const int *strides, int subsamp,
+	unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
+	int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	JSAMPLE *_tmpbuf[MAX_COMPONENTS];
@@ -1621,7 +1627,7 @@
 		ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 		inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
 		if(!inbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		ptr=srcPlanes[i];
+		ptr=(JSAMPLE *)srcPlanes[i];
 		for(row=0; row<ph[i]; row++)
 		{
 			inbuf[i][row]=ptr;
@@ -1662,11 +1668,11 @@
 	return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
 	int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
 	int height, int pixelFormat, int flags)
 {
-	unsigned char *srcPlanes[3];
+	const unsigned char *srcPlanes[3];
 	int pw0, ph0, strides[3], retval=-1;
 
 	if(srcBuf==NULL || pad<0 || !isPow2(pad) || subsamp<0 || subsamp>=NUMSUBOPT
@@ -1699,8 +1705,8 @@
 }
 
 DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char **dstPlanes,
-	int width, int *strides, int height, int flags)
+	const unsigned char *jpegBuf, unsigned long jpegSize,
+	unsigned char **dstPlanes, int width, int *strides, int height, int flags)
 {
 	int i, sfi, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
 	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
@@ -1873,7 +1879,7 @@
 }
 
 DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
 	int width, int pad, int height, int flags)
 {
 	unsigned char *dstPlanes[3];
@@ -1966,9 +1972,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, int n, unsigned char **dstBufs,
-	unsigned long *dstSizes, tjtransform *t, int flags)
+DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, int n,
+	unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *t, int flags)
 {
 	jpeg_transform_info *xinfo=NULL;
 	jvirt_barray_ptr *srccoefs, *dstcoefs;
diff --git a/turbojpeg.h b/turbojpeg.h
index d5c624e..583029f 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -619,7 +619,7 @@
  * @param handle a handle to a TurboJPEG compressor or transformer instance
  *
  * @param srcBuf pointer to an image buffer containing RGB, grayscale, or
- * CMYK pixels to be compressed.  This buffer is not modified.
+ * CMYK pixels to be compressed
  *
  * @param width width (in pixels) of the source image
  *
@@ -672,7 +672,7 @@
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
   unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags);
 
@@ -687,7 +687,7 @@
  * #tjBufSizeYUV2() for the given image width, height, padding, and level of
  * chrominance subsampling.  The Y, U (Cb), and V (Cr) image planes should be
  * stored sequentially in the source buffer (refer to @ref YUVnotes
- * "YUV Image Format Notes".)  This buffer is not modified.
+ * "YUV Image Format Notes".)
  *
  * @param width width (in pixels) of the source image.  If the width is not an
  * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
@@ -736,9 +736,9 @@
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle, unsigned char *srcBuf,
-  int width, int pad, int height, int subsamp, unsigned char **jpegBuf,
-  unsigned long *jpegSize, int jpegQual, int flags);
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
+  const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
+  unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags);
 
 
 /**
@@ -752,7 +752,7 @@
  * memory.  The size of each plane should match the value returned by
  * #tjPlaneSizeYUV() for the given image width, height, strides, and level of
  * chrominance subsampling.  Refer to @ref YUVnotes "YUV Image Format Notes"
- * for more details.  These image planes are not modified.
+ * for more details.
  *
  * @param width width (in pixels) of the source image.  If the width is not an
  * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
@@ -807,8 +807,9 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
 DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
-  unsigned char **srcPlanes, int width, int *strides, int height, int subsamp,
-  unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags);
+  const unsigned char **srcPlanes, int width, const int *strides, int height,
+  int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
+  int flags);
 
 
 /**
@@ -926,7 +927,7 @@
  * @param handle a handle to a TurboJPEG compressor or transformer instance
  *
  * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- * to be encoded.  This buffer is not modified.
+ * to be encoded
  *
  * @param width width (in pixels) of the source image
  *
@@ -966,8 +967,8 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
 DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
-  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char *dstBuf, int pad, int subsamp, int flags);
+  const unsigned char *srcBuf, int width, int pitch, int height,
+  int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags);
 
 
 /**
@@ -979,7 +980,7 @@
  * @param handle a handle to a TurboJPEG compressor or transformer instance
  *
  * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- * to be encoded.  This buffer is not modified.
+ * to be encoded
  *
  * @param width width (in pixels) of the source image
  *
@@ -1024,8 +1025,9 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
 DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
-  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char **dstPlanes, int *strides, int subsamp, int flags);
+  const unsigned char *srcBuf, int width, int pitch, int height,
+  int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
+  int flags);
 
 
 /**
@@ -1042,8 +1044,7 @@
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  *
- * @param jpegBuf pointer to a buffer containing a JPEG image.  This buffer is
- * not modified.
+ * @param jpegBuf pointer to a buffer containing a JPEG image
  *
  * @param jpegSize size of the JPEG image (in bytes)
  *
@@ -1064,8 +1065,8 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
 DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-  int *jpegSubsamp, int *jpegColorspace);
+  const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
+  int *height, int *jpegSubsamp, int *jpegColorspace);
 
 
 /**
@@ -1086,8 +1087,7 @@
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  *
- * @param jpegBuf pointer to a buffer containing the JPEG image to decompress.
- * This buffer is not modified.
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
  *
  * @param jpegSize size of the JPEG image (in bytes)
  *
@@ -1132,7 +1132,7 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
 DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pitch, int height, int pixelFormat, int flags);
 
 
@@ -1143,8 +1143,7 @@
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  *
- * @param jpegBuf pointer to a buffer containing the JPEG image to decompress.
- * This buffer is not modified.
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
  *
  * @param jpegSize size of the JPEG image (in bytes)
  *
@@ -1183,7 +1182,7 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
 DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pad, int height, int flags);
 
 
@@ -1194,8 +1193,7 @@
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  *
- * @param jpegBuf pointer to a buffer containing the JPEG image to decompress.
- * This buffer is not modified.
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
  *
  * @param jpegSize size of the JPEG image (in bytes)
  *
@@ -1240,8 +1238,8 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
 DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char **dstPlanes,
-  int width, int *strides, int height, int flags);
+  const unsigned char *jpegBuf, unsigned long jpegSize,
+  unsigned char **dstPlanes, int width, int *strides, int height, int flags);
 
 
 /**
@@ -1257,7 +1255,7 @@
  * #tjBufSizeYUV2() for the given image width, height, padding, and level of
  * chrominance subsampling.  The Y, U (Cb), and V (Cr) image planes should be
  * stored sequentially in the source buffer (refer to @ref YUVnotes
- * "YUV Image Format Notes".)  This buffer is not modified.
+ * "YUV Image Format Notes".)
  *
  * @param pad Use this parameter to specify that the width of each line in each
  * plane of the YUV source image is padded to the nearest multiple of this
@@ -1291,7 +1289,7 @@
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
   int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
   int height, int pixelFormat, int flags);
 
@@ -1310,7 +1308,7 @@
  * The size of each plane should match the value returned by #tjPlaneSizeYUV()
  * for the given image width, height, strides, and level of chrominance
  * subsampling.  Refer to @ref YUVnotes "YUV Image Format Notes" for more
- * details.  These image planes are not modified.
+ * details.
  *
  * @param strides an array of integers, each specifying the number of bytes per
  * line in the corresponding plane of the YUV source image.  Setting the stride
@@ -1349,8 +1347,9 @@
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
 DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
-  unsigned char **srcPlanes, int *strides, int subsamp, unsigned char *dstBuf,
-  int width, int pitch, int height, int pixelFormat, int flags);
+  const unsigned char **srcPlanes, const int *strides, int subsamp,
+  unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
+  int flags);
 
 
 /**
@@ -1378,7 +1377,7 @@
  * @param handle a handle to a TurboJPEG transformer instance
  *
  * @param jpegBuf pointer to a buffer containing the JPEG source image to
- * transform.  This buffer is not modified.
+ * transform
  *
  * @param jpegSize size of the JPEG source image (in bytes)
  *
@@ -1417,9 +1416,10 @@
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle, unsigned char *jpegBuf,
-  unsigned long jpegSize, int n, unsigned char **dstBufs,
-  unsigned long *dstSizes, tjtransform *transforms, int flags);
+DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
+  const unsigned char *jpegBuf, unsigned long jpegSize, int n,
+  unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms,
+  int flags);
 
 
 /**
diff --git a/usage.txt b/usage.txt
index ef8e6d0..5abda4e 100644
--- a/usage.txt
+++ b/usage.txt
@@ -51,9 +51,10 @@
 
 The currently supported image file formats are: PPM (PBMPLUS color format),
 PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster Toolkit
-format).  (RLE is supported only if the URT library is available.)
-cjpeg recognizes the input image format automatically, with the exception
-of some Targa-format files.  You have to tell djpeg which format to generate.
+format).  (RLE is supported only if the URT library is available, which it
+isn't on most non-Unix systems.)  cjpeg recognizes the input image format
+automatically, with the exception of some Targa files.  You have to tell djpeg
+which format to generate.
 
 JPEG files are in the defacto standard JFIF file format.  There are other,
 less widely used JPEG-based file formats, but we don't support them.
@@ -104,17 +105,17 @@
 file, and the closer the output image will be to the original input.  Normally
 you want to use the lowest quality setting (smallest file) that decompresses
 into something visually indistinguishable from the original image.  For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right.  If you see defects at -quality 75, then go up 5 or 10
-counts at a time until you are happy with the output image.  (The optimal
-setting will vary from one image to another.)
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at -quality 75, then go up
+5 or 10 counts at a time until you are happy with the output image.  (The
+optimal setting will vary from one image to another.)
 
 -quality 100 will generate a quantization table of all 1's, minimizing loss
 in the quantization step (but there is still information loss in subsampling,
-as well as roundoff error).  This setting is mainly of interest for
-experimental purposes.  Quality values above about 95 are NOT recommended for
-normal use; the compressed file size goes up dramatically for hardly any gain
-in output image quality.
+as well as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
 
 In the other direction, quality values below 50 will produce very small files
 of low image quality.  Settings around 5 to 10 might be useful in preparing an
@@ -275,6 +276,8 @@
                         Useful for viewing on monochrome displays; also,
                         djpeg runs noticeably faster in this mode.
 
+        -rgb            Force RGB output even if JPEG file is grayscale.
+
         -scale M/N      Scale the output image by a factor M/N.  Currently
                         the scale factor must be M/8, where M is an integer
                         between 1 and 16 inclusive, or any reduced fraction
@@ -425,8 +428,9 @@
 decompress, with some loss of image quality, by specifying -onepass for
 one-pass quantization.
 
-To avoid the Unisys LZW patent, djpeg produces uncompressed GIF files.  These
-are larger than they should be, but are readable by standard GIF decoders.
+To avoid the Unisys LZW patent (now expired), djpeg produces uncompressed GIF
+files.  These are larger than they should be, but are readable by standard GIF
+decoders.
 
 
 HINTS FOR BOTH PROGRAMS
@@ -468,14 +472,16 @@
 It can translate the coded representation from one variant of JPEG to another,
 for example from baseline JPEG to progressive JPEG or vice versa.  It can also
 perform some rearrangements of the image data, for example turning an image
-from landscape to portrait format by rotation.
+from landscape to portrait format by rotation.  For EXIF files and JPEG files
+containing Exif data, you may prefer to use exiftran instead.
 
 jpegtran works by rearranging the compressed data (DCT coefficients), without
 ever fully decoding the image.  Therefore, its transformations are lossless:
 there is no image degradation at all, which would not be true if you used
 djpeg followed by cjpeg to accomplish the same conversion.  But by the same
 token, jpegtran cannot perform lossy operations such as changing the image
-quality.
+quality.  However, while the image data is losslessly transformed, metadata
+can be removed.  See the -copy option for specifics.
 
 jpegtran uses a command line syntax similar to cjpeg or djpeg.
 On Unix-like systems, you say:
@@ -543,7 +549,10 @@
 Like the rotate and flip transforms, lossless crop is restricted by the current
 JPEG format; the upper left corner of the selected region must fall on an iMCU
 boundary.  If it doesn't, then it is silently moved up and/or left to the
-nearest iMCU boundary (the lower right corner is unchanged.)
+nearest iMCU boundary (the lower right corner is unchanged.)  Thus, the output
+image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching an
+'f' character ("force") to the width or height number.
 
 The image can be losslessly cropped by giving the switch:
         -crop WxH+X+Y   Crop to a rectangular region of width W and height H,
@@ -564,15 +573,17 @@
 jpegtran also recognizes these switches that control what to do with "extra"
 markers, such as comment blocks:
         -copy none      Copy no extra markers from source file.  This setting
-                        suppresses all comments and other excess baggage
-                        present in the source file.
+                        suppresses all comments and other metadata in the
+                        source file.
         -copy comments  Copy only comment markers.  This setting copies
-                        comments from the source file but discards
-                        any other data that is inessential for image display.
+                        comments from the source file but discards any other
+                        metadata.
         -copy all       Copy all extra markers.  This setting preserves
                         miscellaneous markers found in the source file, such
                         as JFIF thumbnails, Exif data, and Photoshop settings.
                         In some files, these extra markers can be sizable.
+                        Note that this option will copy thumbnails as-is;
+                        they will not be transformed.
 The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
 jpegtran always did the equivalent of -copy none.)
 
diff --git a/win/jconfig.h.in b/win/jconfig.h.in
index 8783900..9d35121 100644
--- a/win/jconfig.h.in
+++ b/win/jconfig.h.in
@@ -3,6 +3,7 @@
 
 #define JPEG_LIB_VERSION @JPEG_LIB_VERSION@
 #define LIBJPEG_TURBO_VERSION @VERSION@
+#define LIBJPEG_TURBO_VERSION_NUMBER @LIBJPEG_TURBO_VERSION_NUMBER@
 #cmakedefine C_ARITH_CODING_SUPPORTED
 #cmakedefine D_ARITH_CODING_SUPPORTED
 #cmakedefine MEM_SRCDST_SUPPORTED
diff --git a/win/jpeg62-memsrcdst.def b/win/jpeg62-memsrcdst.def
index 4511c8e..6499316 100755
--- a/win/jpeg62-memsrcdst.def
+++ b/win/jpeg62-memsrcdst.def
@@ -102,3 +102,5 @@
 	jzero_far @ 101 ; 
 	jpeg_mem_dest @ 102 ; 
 	jpeg_mem_src @ 103 ; 
+	jpeg_skip_scanlines @ 104 ; 
+	jpeg_crop_scanline @ 105 ; 
diff --git a/win/jpeg62.def b/win/jpeg62.def
index 3c33fbf..9f30b1a 100755
--- a/win/jpeg62.def
+++ b/win/jpeg62.def
@@ -100,3 +100,5 @@
 	jpeg_write_tables @ 99 ; 
 	jround_up @ 100 ; 
 	jzero_far @ 101 ; 
+	jpeg_skip_scanlines @ 102 ; 
+	jpeg_crop_scanline @ 103 ; 
diff --git a/win/jpeg7.def b/win/jpeg7.def
index 5ca227b..92463c5 100644
--- a/win/jpeg7.def
+++ b/win/jpeg7.def
@@ -102,3 +102,5 @@
 	jpeg_write_tables @ 101 ; 
 	jround_up @ 102 ; 
 	jzero_far @ 103 ; 
+	jpeg_skip_scanlines @ 104 ; 
+	jpeg_crop_scanline @ 105 ; 
diff --git a/win/jpeg8.def b/win/jpeg8.def
index 3fa6111..19246ac 100644
--- a/win/jpeg8.def
+++ b/win/jpeg8.def
@@ -105,3 +105,5 @@
 	jpeg_write_tables @ 104 ; 
 	jround_up @ 105 ; 
 	jzero_far @ 106 ; 
+	jpeg_skip_scanlines @ 107 ; 
+	jpeg_crop_scanline @ 108 ; 
diff --git a/wrbmp.c b/wrbmp.c
index b7ecb49..50e469c 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in Microsoft "BMP"
  * format (MS Windows 3.x and OS/2 1.x flavors).
@@ -57,7 +58,7 @@
   JDIMENSION cur_output_row;    /* next row# to write to virtual array */
 } bmp_dest_struct;
 
-typedef bmp_dest_struct * bmp_dest_ptr;
+typedef bmp_dest_struct *bmp_dest_ptr;
 
 
 /* Forward declarations */
@@ -199,7 +200,7 @@
          array[offset+1] = (char) (((value) >> 8) & 0xFF), \
          array[offset+2] = (char) (((value) >> 16) & 0xFF), \
          array[offset+3] = (char) (((value) >> 24) & 0xFF))
-  INT32 headersize, bfSize;
+  long headersize, bfSize;
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
@@ -223,7 +224,7 @@
   }
   /* File size */
   headersize = 14 + 40 + cmap_entries * 4; /* Header and colormap */
-  bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height;
+  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
 
   /* Set unused fields of header to 0 */
   MEMZERO(bmpfileheader, sizeof(bmpfileheader));
@@ -245,8 +246,8 @@
   /* we leave biCompression = 0, for none */
   /* we leave biSizeImage = 0; this is correct for uncompressed data */
   if (cinfo->density_unit == 2) { /* if have density in dots/cm, then */
-    PUT_4B(bmpinfoheader, 24, (INT32) (cinfo->X_density*100)); /* XPels/M */
-    PUT_4B(bmpinfoheader, 28, (INT32) (cinfo->Y_density*100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 24, (long) (cinfo->X_density*100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 28, (long) (cinfo->Y_density*100)); /* YPels/M */
   }
   PUT_2B(bmpinfoheader, 32, cmap_entries); /* biClrUsed */
   /* we leave biClrImportant = 0 */
@@ -267,7 +268,7 @@
 {
   char bmpfileheader[14];
   char bmpcoreheader[12];
-  INT32 headersize, bfSize;
+  long headersize, bfSize;
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
@@ -291,7 +292,7 @@
   }
   /* File size */
   headersize = 14 + 12 + cmap_entries * 3; /* Header and colormap */
-  bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height;
+  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
 
   /* Set unused fields of header to 0 */
   MEMZERO(bmpfileheader, sizeof(bmpfileheader));
@@ -332,7 +333,7 @@
 {
   JSAMPARRAY colormap = cinfo->colormap;
   int num_colors = cinfo->actual_number_of_colors;
-  FILE * outfile = dest->pub.output_file;
+  FILE *outfile = dest->pub.output_file;
   int i;
 
   if (colormap != NULL) {
@@ -382,7 +383,7 @@
 finish_output_bmp (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
-  register FILE * outfile = dest->pub.output_file;
+  register FILE *outfile = dest->pub.output_file;
   JSAMPARRAY image_ptr;
   register JSAMPROW data_ptr;
   JDIMENSION row;
diff --git a/wrgif.c b/wrgif.c
index d260ee0..cc06f1d 100644
--- a/wrgif.c
+++ b/wrgif.c
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in GIF format.
  *
@@ -54,7 +55,7 @@
   /* State for packing variable-width codes into a bitstream */
   int n_bits;                   /* current number of bits/code */
   int maxcode;                  /* maximum code, given n_bits */
-  INT32 cur_accum;              /* holds bits not yet output */
+  long cur_accum;               /* holds bits not yet output */
   int cur_bits;                 /* # of bits in cur_accum */
 
   /* State for GIF code assignment */
@@ -68,7 +69,7 @@
 
 } gif_dest_struct;
 
-typedef gif_dest_struct * gif_dest_ptr;
+typedef gif_dest_struct *gif_dest_ptr;
 
 /* Largest value that will fit in N bits */
 #define MAXCODE(n_bits) ((1 << (n_bits)) - 1)
@@ -108,7 +109,7 @@
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
-  dinfo->cur_accum |= ((INT32) code) << dinfo->cur_bits;
+  dinfo->cur_accum |= ((long) code) << dinfo->cur_bits;
   dinfo->cur_bits += dinfo->n_bits;
 
   while (dinfo->cur_bits >= 8) {
diff --git a/wrjpgcom.c b/wrjpgcom.c
index 0a22f62..cd67afd 100644
--- a/wrjpgcom.c
+++ b/wrjpgcom.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2014, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a very simple stand-alone application that inserts
  * user-supplied text as a COM (comment) marker in a JFIF file.
@@ -17,7 +18,7 @@
 #include "jinclude.h"           /* get auto-config symbols, <stdio.h> */
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc() */
-extern void * malloc ();
+extern void *malloc ();
 #endif
 #include <ctype.h>              /* to declare isupper(), tolower() */
 #ifdef USE_SETMODE
@@ -65,12 +66,12 @@
  * To reuse this code in another application, you might need to change these.
  */
 
-static FILE * infile;           /* input JPEG file */
+static FILE *infile;            /* input JPEG file */
 
 /* Return next input byte, or EOF if no more */
 #define NEXTBYTE()  getc(infile)
 
-static FILE * outfile;          /* output JPEG file */
+static FILE *outfile;           /* output JPEG file */
 
 /* Emit an output byte */
 #define PUTBYTE(x)  putc((x), outfile)
@@ -337,7 +338,7 @@
 
 /* Command line parsing code */
 
-static const char * progname;   /* program name for error messages */
+static const char *progname;    /* program name for error messages */
 
 
 static void
@@ -374,7 +375,7 @@
 
 
 static int
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -406,10 +407,10 @@
 main (int argc, char **argv)
 {
   int argn;
-  char * arg;
+  char *arg;
   int keep_COM = 1;
-  char * comment_arg = NULL;
-  FILE * comment_file = NULL;
+  char *comment_arg = NULL;
+  FILE *comment_file = NULL;
   unsigned int comment_length = 0;
   int marker;
 
@@ -543,7 +544,7 @@
 
   /* Collect comment text from comment_file or stdin, if necessary */
   if (comment_arg == NULL) {
-    FILE * src_file;
+    FILE *src_file;
     int c;
 
     comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
diff --git a/wrppm.c b/wrppm.c
index d3a613c..40fbf1f 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -6,7 +6,8 @@
  * Modified 2009 by Guido Vollbeding.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in PPM/PGM format.
  * The extended 2-byte-per-sample raw PPM/PGM formats are supported.
@@ -19,6 +20,7 @@
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "wrppm.h"
 
 #ifdef PPM_SUPPORTED
 
@@ -61,21 +63,6 @@
  */
 
 
-/* Private version of data destination object */
-
-typedef struct {
-  struct djpeg_dest_struct pub; /* public fields */
-
-  /* Usually these two pointers point to the same place: */
-  char *iobuffer;               /* fwrite's I/O buffer */
-  JSAMPROW pixrow;              /* decompressor output buffer */
-  size_t buffer_width;          /* width of I/O buffer */
-  JDIMENSION samples_per_row;   /* JSAMPLEs per output row */
-} ppm_dest_struct;
-
-typedef ppm_dest_struct * ppm_dest_ptr;
-
-
 /*
  * Write some pixel data.
  * In this module rows_supplied will always be 1.
@@ -104,7 +91,7 @@
                  JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
@@ -127,7 +114,7 @@
                   JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register int pixval;
   register JSAMPROW ptr;
   register JSAMPROW color_map0 = cinfo->colormap[0];
@@ -152,7 +139,7 @@
                    JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register JSAMPROW ptr;
   register JSAMPROW color_map = cinfo->colormap[0];
   register JDIMENSION col;
diff --git a/wrppm.h b/wrppm.h
new file mode 100644
index 0000000..aa6c562
--- /dev/null
+++ b/wrppm.h
@@ -0,0 +1,26 @@
+/*
+ * wrppm.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#ifdef PPM_SUPPORTED
+
+/* Private version of data destination object (PPM/PGM output, see wrppm.c) */
+
+typedef struct {
+  struct djpeg_dest_struct pub; /* public fields */
+
+  /* Usually these two pointers point to the same place: */
+  char *iobuffer;               /* fwrite's I/O buffer */
+  JSAMPROW pixrow;              /* decompressor output buffer */
+  size_t buffer_width;          /* width of I/O buffer */
+  JDIMENSION samples_per_row;   /* JSAMPLEs per output row */
+} ppm_dest_struct;
+
+typedef ppm_dest_struct *ppm_dest_ptr;  /* wrppm.c casts dinfo to this type */
+
+#endif /* PPM_SUPPORTED */
diff --git a/wrrle.c b/wrrle.c
index 6f35ad9..cc95b41 100644
--- a/wrrle.c
+++ b/wrrle.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in RLE format.
  * The Utah Raster Toolkit library is required (version 3.1 or later).
@@ -61,7 +62,7 @@
 
 } rle_dest_struct;
 
-typedef rle_dest_struct * rle_dest_ptr;
+typedef rle_dest_struct *rle_dest_ptr;
 
 /* Forward declarations */
 METHODDEF(void) rle_put_pixel_rows
@@ -236,7 +237,7 @@
   } else {
     for (row = cinfo->output_height-1; row >= 0; row--) {
       rle_row = (rle_pixel **) dest->rle_row;
-      output_row = * (*cinfo->mem->access_virt_sarray)
+      output_row = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, dest->image,
          (JDIMENSION) row, (JDIMENSION) 1, FALSE);
       red = rle_row[0];
diff --git a/wrtarga.c b/wrtarga.c
index 5fbfc53..c02b332 100644
--- a/wrtarga.c
+++ b/wrtarga.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * It was modified by The libjpeg-turbo Project to include only code and
  * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in Targa format.
  *
@@ -40,7 +41,7 @@
   JDIMENSION buffer_width;      /* width of one row */
 } tga_dest_struct;
 
-typedef tga_dest_struct * tga_dest_ptr;
+typedef tga_dest_struct *tga_dest_ptr;
 
 
 LOCAL(void)
@@ -95,7 +96,7 @@
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
@@ -116,7 +117,7 @@
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
@@ -139,7 +140,7 @@
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JSAMPROW color_map0 = cinfo->colormap[0];
   register JDIMENSION col;