| /* |
| * Copyright © 2019 Facebook, Inc. |
| * |
| * This is part of HarfBuzz, a text shaping library. |
| * |
| * Permission is hereby granted, without written agreement and without |
| * license or royalty fees, to use, copy, modify, and distribute this |
| * software and its documentation for any purpose, provided that the |
| * above copyright notice and the following two paragraphs appear in |
| * all copies of this software. |
| * |
| * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
| * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
| * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| * |
| * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
| * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
| * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
| * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
| * |
| * Facebook Author(s): Behdad Esfahbod |
| */ |
| |
| #ifndef HB_SIMD_HH |
| #define HB_SIMD_HH |
| |
| #include "hb.hh" |
| #include "hb-meta.hh" |
| #include "hb-algs.hh" |
| |
| /* |
| * = MOTIVATION |
| * |
| * We hit a performance plateau with HarfBuzz many many years ago. The last |
| * *major* speedup was when hb_set_digest_t was added back in 2012. Since then |
 * we have shaved off a lot of extra work, but none of that matters since
| * shaping time is dominated by two hot spots, which no tweaks whatsoever could |
| * speed up any more. SIMD is the only chance to gain a major speedup over |
| * that, and that is what this file is about. |
| * |
 * For heavy fonts, i.e. those having dozens, or over a hundred, lookups (Amiri,
 * IranNastaliq, NotoNastaliqUrdu, heavy Indic fonts, etc.), the bottleneck is
 * just looping over lookups, and for each lookup over the buffer contents,
 * testing whether the current lookup applies to the current glyph; normally
 * that would be a binary search in the Coverage table, but that's where the
 * hb_set_digest_t speedup came from: hb_set_digest_t is a narrow structure
 * (3 or 4 integers) that implements approximate matching, similar to Bloom
 * Filters or Quotient Filters.  These digests do all their work using bitwise
 * operations, so they can be easily vectorized.  Combined with a gather
 * operation, or just multiple fetches in a row (which should parallelize) when
 * gather is not available, this will allow us to skip over 8 or 16 glyphs at
 * a time.
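 *
 * As a rough illustration of why the digest test vectorizes well (this is a
 * hypothetical single-mask, Bloom-filter-style digest, not the actual
 * hb_set_digest_t layout), a "may-have" test of 8 glyph IDs at once could
 * look like:
 *
 *   static inline __m256i
 *   digest_may_have_8 (__m256i glyphs, uint32_t mask)
 *   {
 *     // One bit per glyph, selected by the low 5 bits of the glyph ID.
 *     __m256i bits = _mm256_sllv_epi32 (_mm256_set1_epi32 (1),
 *                                       _mm256_and_si256 (glyphs, _mm256_set1_epi32 (31)));
 *     // A lane is all-ones if the digest *may* contain that glyph, zero if it cannot.
 *     return _mm256_cmpeq_epi32 (_mm256_and_si256 (bits, _mm256_set1_epi32 ((int) mask)), bits);
 *   }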
| * |
 * For fast fonts, like simple Latin fonts such as Roboto, the majority of time
 * is spent binary-searching the Coverage table of kern and liga lookups.
 * We can, again, use vector gather and comparison operations to implement a
 * 9-ary or 17-ary search instead of a binary search, which will reduce the
 * search depth by ~3x / ~4x respectively.  It's important to keep in mind that
 * a 16-at-a-time 17-ary search is /not/ in any way 17 times faster.  It is
 * only 4 times faster at best, since the number of search steps only shrinks
 * by a factor of log(17)/log(2) ~= 4 compared to binary search.  That should
 * be taken into account while assessing various designs.
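 *
 * As a concrete example: for a 1000-entry Coverage table, a binary search
 * takes about 10 probes (log(1000)/log(2) ~= 10), a 9-ary search about 4
 * (log(1000)/log(9) ~= 3.1), and a 17-ary search about 3
 * (log(1000)/log(17) ~= 2.4).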
| * |
 * The rest of this file adds facilities to implement those, and possibly
| * more. |
| * |
| * |
| * = INTRO to VECTOR EXTENSIONS |
| * |
| * A great series of short articles to get started with vector extensions is: |
| * |
| * https://www.codingame.com/playgrounds/283/sse-avx-vectorization/ |
| * |
| * If analyzing existing vector implementation to improve performance, the |
| * following comes handy since it has instruction latencies, throughputs, and |
| * micro-operation breakdown: |
| * |
| * https://www.agner.org/optimize/instruction_tables.pdf |
| * |
| * For x86 programming the following intrinsics guide and cheatsheet are |
 * indispensable:
| * |
| * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ |
| * https://db.in.tum.de/~finis/x86-intrin-cheatsheet-v2.2.pdf |
| * |
| * |
| * = WHICH EXTENSIONS TO TARGET |
| * |
 * There are four vector extensions that matter for the kind of integer work we need:
| * |
| * On x86 / x86_64 (Intel / AMD), those are (in order of introduction): |
| * |
| * - SSE2 128 bits |
| * - AVX2 256 bits |
| * - AVX-512 512 bits |
| * |
| * On ARM there seems to be just: |
| * |
| * - NEON 128 bits |
| * |
| * Intel also provides an implementation of the NEON intrinsics API backed by SSE: |
| * |
| * https://github.com/intel/ARM_NEON_2_x86_SSE |
| * |
| * So it's possible we can skip implementing SSE2 separately. |
| * |
 * To see which extensions are available on your machine you can run, e.g.:
| * |
| * $ gcc -march=native -dM -E - < /dev/null | egrep "SSE|AVX" | sort |
| * #define __AVX__ 1 |
| * #define __AVX2__ 1 |
| * #define __SSE__ 1 |
| * #define __SSE2__ 1 |
| * #define __SSE2_MATH__ 1 |
| * #define __SSE3__ 1 |
| * #define __SSE4_1__ 1 |
| * #define __SSE4_2__ 1 |
| * #define __SSE_MATH__ 1 |
| * #define __SSSE3__ 1 |
| * |
 * If no -march is set, my Fedora gcc defaults to enabling SSE2 but not any AVX:
| * |
| * $ gcc -dM -E - < /dev/null | egrep "SSE|AVX" | sort |
| * #define __SSE__ 1 |
| * #define __SSE2__ 1 |
| * #define __SSE2_MATH__ 1 |
| * #define __SSE_MATH__ 1 |
| * |
| * Given that, we should explore an SSE2 target at some point for sure |
 * (possibly via the NEON path). But initially, targeting AVX2 makes the most
 * sense, for the following reasons:
| * |
 *  - The number of integer operations we do on each vector is very small.
 *    This means that how fast we can load data into the vector registers is of
 *    paramount importance to the resulting performance.  AVX2 is the first
 *    extension to add a "gather" operation.  We will use that to load 8 or 16
 *    glyphs from non-consecutive memory addresses into one vector.
| * |
 *  - AVX-512 is practically the same as AVX2 but extended to 512 bits instead
 *    of 256.  However, it's not widely available on lower-power CPUs.  For
 *    example, my 2019 ThinkPad Yoga X1 does *not* support it.  We should
 *    definitely explore that, but not initially.  Also, it is possible that
 *    the extra memory traffic it generates will defeat the speedup we can gain
 *    from it.  Also note that for the search use case, doubling the bit width
 *    from 256 to 512, as discussed, has a hard cap of less than 30% maximum
 *    speedup (log(17) / log(9)).  It must be implemented and measured
 *    carefully.
| */ |
| |
| /* DESIGN |
| * |
| * There are a few complexities to using AVX2 though, which we need to |
| * overcome. The main one that sticks out is: |
| * |
 * While the 256-bit integer register (called __m256i) can be used as 16
 * uint16_t values, there /doesn't/ exist a 16-bit version of the gather
 * operation.  The closest thing gathers 8 uint32_t values
 * (_mm256_[mask_]i32gather_epi32()).  I believe this is because there are only
 * eight ports to fetch from memory concurrently.
| * |
 * Whatever the reason, this is wasteful.  OpenType glyph indices are 16-bit
 * numbers, so in theory we should be able to process 16 at a time, if there
 * were a 16-bit gather operator.
| * |
 * This is also problematic for loading GlyphID values in that we should make
 * sure the extra two bytes loaded by the 32-bit gather operator do not cause
 * invalid memory access before we throw those extra bytes away.
| * |
| * There are a couple of different approaches we can take to mitigate these: |
| * |
 * Making two gather calls sequentially and swizzling the results together
 * might work just fine.  Especially since the second one will be fulfilled
 * straight from the L1 cache lines.
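 *
 * A minimal sketch of that idea (untested; `idx_lo` and `idx_hi` are
 * hypothetical index vectors holding the byte offsets of records 0-7 and
 * 8-15, and the values are assumed already in native byte order):
 *
 *   __m256i lo = _mm256_i32gather_epi32 ((const int *) base, idx_lo, 1);
 *   __m256i hi = _mm256_i32gather_epi32 ((const int *) base, idx_hi, 1);
 *   // Keep only the low 16 bits of each 32-bit lane...
 *   lo = _mm256_and_si256 (lo, _mm256_set1_epi32 (0xFFFF));
 *   hi = _mm256_and_si256 (hi, _mm256_set1_epi32 (0xFFFF));
 *   // ...then pack 8+8 values into sixteen 16-bit lanes.
 *   __m256i v16 = _mm256_packus_epi32 (lo, hi);
 *   // _mm256_packus_epi32 interleaves the two 128-bit lanes; restore order.
 *   v16 = _mm256_permute4x64_epi64 (v16, _MM_SHUFFLE (3, 1, 2, 0));
 *
 * The overread concern above still applies: the gather for the last record
 * reads two bytes past it.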
| * |
 * Another snag I hit is that AVX2 only has signed comparisons, not unsigned.
 * So we add a bias of -32768 to map uint16_t values onto int16_t order before
 * comparing.
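 *
 * A sketch of that trick (with `a` and `b` as __m256i vectors of uint16_t
 * lanes):
 *
 *   const __m256i bias = _mm256_set1_epi16 (-32768);
 *   // (a > b) as unsigned 16-bit  ==  (a - 32768 > b - 32768) as signed 16-bit.
 *   __m256i gt = _mm256_cmpgt_epi16 (_mm256_add_epi16 (a, bias),
 *                                    _mm256_add_epi16 (b, bias));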
| * |
 * Also note that the order of arguments to _mm256_set_epi32() and family
 * is the opposite of what I originally assumed: the highest-index element
 * comes first.  The docs are correct, just not what you might assume.
| * |
| * PREFETCH |
| * |
| * We should use prefetch instructions to request load of the next batch of |
| * hb_glyph_info_t's while we process the current batch. |
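 *
 * For example (a sketch; `info` and `count` are hypothetical here):
 *
 *   for (unsigned i = 0; i < count; i += 8)
 *   {
 *     if (i + 8 < count)
 *       _mm_prefetch ((const char *) &info[i + 8], _MM_HINT_T0);
 *     // ... process info[i .. i+7] ...
 *   }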
| */ |
| |
| |
| /* |
| * Choose intrinsics set to use. |
| */ |
| |
| |
| #if !defined(HB_NO_SIMD) && (defined(HB_SIMD_AVX2) || defined(__AVX2__)) |
| |
| #pragma GCC target("avx2") |
| |
| #include <x86intrin.h> |
| |
| /* TODO: Test -mvzeroupper. */ |
| |
| |
| template <bool is_range = false> |
| static inline bool |
| hb_simd_ksearch_glyphid (unsigned *pos, /* Out */ |
| hb_codepoint_t k, |
| const void *base, |
| size_t length, |
| size_t stride) |
| { |
| if (unlikely (k & ~0xFFFF)) |
| { |
| *pos = length; |
| return false; |
| } |
| |
| *pos = 0; |
| |
  /* Find depth of search tree.  Steps are powers of 9: eight probes per level
   * make the search 9-ary. */
  static const unsigned steps[] = {1, 9, 81, 729, 6561, 59049};
| unsigned rank = 1; |
| while (rank < ARRAY_LENGTH (steps) && length >= steps[rank]) |
| rank++; |
| |
  static const __m256i _1x8 = _mm256_set1_epi32 (+1);
  static const __m256i __1x8 = _mm256_set1_epi32 (-1); /* All bits set. */
  static const __m256i _12345678 = _mm256_set_epi32 (8, 7, 6, 5, 4, 3, 2, 1);
  static const __m256i __32768x16 = _mm256_set1_epi16 (-32768); /* Bias mapping uint16_t onto int16_t order. */
| |
| /* Set up key vector. */ |
| const __m256i K = _mm256_add_epi16 (_mm256_set1_epi16 ((signed) k - 32768), _1x8); |
| |
| while (rank) |
| { |
| unsigned step = steps[--rank]; |
| |
| /* Load multiple ranges to test against. */ |
| const unsigned limit = stride * length; |
| const __m256i limits = _mm256_set1_epi32 (limit + 1); |
| const unsigned pitch = stride * step; |
| const __m256i pitches = _mm256_set1_epi32 (pitch); |
| const __m256i offsets = _mm256_mullo_epi32 (pitches, _12345678); |
| const __m256i mask = _mm256_cmpgt_epi32 (limits, offsets); |
| unsigned back_off = stride; |
| |
| /* The actual load... */ |
| __m256i V = _mm256_mask_i32gather_epi32 (__1x8, |
| (const int *) ((const char *) base - back_off), |
| offsets, mask, 1); |
| #if __BYTE_ORDER == __LITTLE_ENDIAN |
| static const __m256i bswap16_shuffle = _mm256_set_epi16 (0x0E0F,0x0C0D,0x0A0B,0x0809, |
| 0x0607,0x0405,0x0203,0x0001, |
| 0x0E0F,0x0C0D,0x0A0B,0x0809, |
| 0x0607,0x0405,0x0203,0x0001); |
| V = _mm256_shuffle_epi8 (V, bswap16_shuffle); |
| #endif |
| if (!is_range) |
| { |
| static const __m256i dup_shuffle = _mm256_set_epi16 (0x0D0C,0x0D0C,0x0908,0x0908, |
| 0x0504,0x0504,0x0100,0x0100, |
| 0x0D0C,0x0D0C,0x0908,0x0908, |
| 0x0504,0x0504,0x0100,0x0100); |
| V = _mm256_shuffle_epi8 (V, dup_shuffle); |
| } |
| V = _mm256_add_epi16 (V, __32768x16); |
| |
| /* Compare and locate. */ |
| unsigned answer = hb_ctz (~_mm256_movemask_epi8 (_mm256_cmpgt_epi16 (K, V))) >> 1; |
| bool found = answer & 1; |
| answer = (answer + 1) >> 1; |
| unsigned move = step * answer; |
| *pos += move; |
| if (found) |
| { |
| *pos -= 1; |
| return true; |
| } |
| length -= move; |
| base = (const void *) ((const char *) base + stride * move); |
| } |
| return false; |
| } |
| |
| static inline bool |
| hb_simd_ksearch_glyphid_range (unsigned *pos, /* Out */ |
| hb_codepoint_t k, |
| const void *base, |
| size_t length, |
| size_t stride) |
| { |
| return hb_simd_ksearch_glyphid<true> (pos, k, base, length, stride); |
| } |
| |
| |
| #elif !defined(HB_NO_SIMD) |
| #define HB_NO_SIMD |
| #endif |
| |
| #endif /* HB_SIMD_HH */ |