| /* |
| * Copyright © 2019 Facebook, Inc. |
| * |
| * This is part of HarfBuzz, a text shaping library. |
| * |
| * Permission is hereby granted, without written agreement and without |
| * license or royalty fees, to use, copy, modify, and distribute this |
| * software and its documentation for any purpose, provided that the |
| * above copyright notice and the following two paragraphs appear in |
| * all copies of this software. |
| * |
| * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
| * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
| * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| * |
| * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
| * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
| * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
| * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
| * |
| * Facebook Author(s): Behdad Esfahbod |
| */ |
| |
| #ifndef HB_SIMD_HH |
| #define HB_SIMD_HH |
| |
| #include "hb.hh" |
| #include "hb-meta.hh" |
| #include "hb-algs.hh" |
| |
| /* |
| * = MOTIVATION |
| * |
| * We hit a performance plateau with HarfBuzz many many years ago. The last |
| * *major* speedup was when hb_set_digest_t was added back in 2012. Since then |
 * we have shaved off a lot of extra work, but none of that matters since
| * shaping time is dominated by two hot spots, which no tweaks whatsoever could |
| * speed up any more. SIMD is the only chance to gain a major speedup over |
| * that, and that is what this file is about. |
| * |
 * For heavy fonts, i.e. those having dozens, or over a hundred, lookups (Amiri,
 * IranNastaliq, NotoNastaliqUrdu, heavy Indic fonts, etc.), the bottleneck is
 * just looping over lookups, and for each lookup over the buffer contents,
 * testing whether the current lookup applies to the current glyph; normally
 * that would be a binary search in the Coverage table, but that's where the
 * hb_set_digest_t speedup came from: hb_set_digest_t is a narrow structure
 * (3 or 4 integers) that implements approximate matching, similar to Bloom
 * Filters or Quotient Filters.  These digests do all their work using bitwise
 * operations, so they can be easily vectorized.  Combined with a gather
 * operation, or just multiple fetches in a row (which should parallelize) when
 * gather is not available, this will allow us to skip over 8 or 16 glyphs at
 * a time.
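 *
 * As a rough illustration of why the digest test vectorizes well (this is a
 * hypothetical single-mask, Bloom-filter-style digest, not the actual
 * hb_set_digest_t layout), a "may-have" test of 8 glyph IDs at once could
 * look like:
 *
 *   static inline __m256i
 *   digest_may_have_8 (__m256i glyphs, uint32_t mask)
 *   {
 *     // One bit per glyph, selected by the low 5 bits of the glyph ID.
 *     __m256i bits = _mm256_sllv_epi32 (_mm256_set1_epi32 (1),
 *                                       _mm256_and_si256 (glyphs, _mm256_set1_epi32 (31)));
 *     // A lane is all-ones if the digest *may* contain that glyph, zero if it cannot.
 *     return _mm256_cmpeq_epi32 (_mm256_and_si256 (bits, _mm256_set1_epi32 ((int) mask)), bits);
 *   }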
| * |
 * For fast fonts, like simple Latin fonts such as Roboto, the majority of time
 * is spent binary-searching the Coverage table of kern and liga lookups.
 * We can, again, use vector gather and comparison operations to implement a
 * 9-ary or 17-ary search instead of a binary search, which will reduce the
 * search depth by ~3x / ~4x respectively.  It's important to keep in mind that
 * a 16-at-a-time 17-ary search is /not/ in any way 17 times faster.  It is
 * only 4 times faster at best, since the number of search steps only shrinks
 * by a factor of log(17)/log(2) ~= 4 compared to binary search.  That should
 * be taken into account while assessing various designs.
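 *
 * As a concrete example: for a 1000-entry Coverage table, a binary search
 * takes about 10 probes (log(1000)/log(2) ~= 10), a 9-ary search about 4
 * (log(1000)/log(9) ~= 3.1), and a 17-ary search about 3
 * (log(1000)/log(17) ~= 2.4).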
| * |
 * The rest of this file adds facilities to implement those, and possibly
| * more. |
| * |
| * |
| * = INTRO to VECTOR EXTENSIONS |
| * |
| * A great series of short articles to get started with vector extensions is: |
| * |
| * https://www.codingame.com/playgrounds/283/sse-avx-vectorization/ |
| * |
| * If analyzing existing vector implementation to improve performance, the |
| * following comes handy since it has instruction latencies, throughputs, and |
| * micro-operation breakdown: |
| * |
| * https://www.agner.org/optimize/instruction_tables.pdf |
| * |
| * For x86 programming the following intrinsics guide and cheatsheet are |
 * indispensable:
| * |
| * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ |
| * https://db.in.tum.de/~finis/x86-intrin-cheatsheet-v2.2.pdf |
| * |
| * |
| * = WHICH EXTENSIONS TO TARGET |
| * |
 * There are four vector extensions that matter for the kind of integer work we need:
| * |
| * On x86 / x86_64 (Intel / AMD), those are (in order of introduction): |
| * |
| * - SSE2 128 bits |
| * - AVX2 256 bits |
| * - AVX-512 512 bits |
| * |
| * On ARM there seems to be just: |
| * |
| * - NEON 128 bits |
| * |
| * Intel also provides an implementation of the NEON intrinsics API backed by SSE: |
| * |
| * https://github.com/intel/ARM_NEON_2_x86_SSE |
| * |
| * So it's possible we can skip implementing SSE2 separately. |
| * |
 * To see which extensions are available on your machine you can run, e.g.:
| * |
| * $ gcc -march=native -dM -E - < /dev/null | egrep "SSE|AVX" | sort |
| * #define __AVX__ 1 |
| * #define __AVX2__ 1 |
| * #define __SSE__ 1 |
| * #define __SSE2__ 1 |
| * #define __SSE2_MATH__ 1 |
| * #define __SSE3__ 1 |
| * #define __SSE4_1__ 1 |
| * #define __SSE4_2__ 1 |
| * #define __SSE_MATH__ 1 |
| * #define __SSSE3__ 1 |
| * |
 * If no -march is set, my Fedora gcc defaults to enabling SSE2 but not any AVX:
| * |
| * $ gcc -dM -E - < /dev/null | egrep "SSE|AVX" | sort |
| * #define __SSE__ 1 |
| * #define __SSE2__ 1 |
| * #define __SSE2_MATH__ 1 |
| * #define __SSE_MATH__ 1 |
| * |
| * Given that, we should explore an SSE2 target at some point for sure |
 * (possibly via the NEON path). But initially, targeting AVX2 makes the most
 * sense, for the following reasons:
| * |
 *  - The number of integer operations we do on each vector is very small.
 *    This means that how fast we can load data into the vector registers is of
 *    paramount importance to the resulting performance.  AVX2 is the first
 *    extension to add a "gather" operation.  We will use that to load 8 or 16
 *    glyphs from non-consecutive memory addresses into one vector.
| * |
 *  - AVX-512 is practically the same as AVX2 but extended to 512 bits instead
 *    of 256.  However, it's not widely available on lower-power CPUs.  For
 *    example, my 2019 ThinkPad Yoga X1 does *not* support it.  We should
 *    definitely explore that, but not initially.  Also, it is possible that
 *    the extra memory traffic it generates will defeat the speedup we can gain
 *    from it.  Also note that for the search use case, doubling the bit width
 *    from 256 to 512, as discussed, has a hard cap of less than 30% maximum
 *    speedup (log(17) / log(9)).  It must be implemented and measured
 *    carefully.
| */ |
| |
| /* DESIGN |
| * |
| * There are a few complexities to using AVX2 though, which we need to |
| * overcome. The main one that sticks out is: |
| * |
 * While the 256-bit integer register (called __m256i) can be used as 16
 * uint16_t values, there /doesn't/ exist a 16-bit version of the gather
 * operation.  The closest thing gathers 8 uint32_t values
 * (_mm256_[mask_]i32gather_epi32()).  I believe this is because there are only
 * eight ports to fetch from memory concurrently.
| * |
 * Whatever the reason, this is wasteful.  OpenType glyph indices are 16-bit
 * numbers, so in theory we should be able to process 16 at a time, if there
 * were a 16-bit gather operator.
| * |
 * This is also problematic for loading GlyphID values in that we should make
 * sure the extra two bytes loaded by the 32-bit gather operator do not cause
 * invalid memory access before we throw those extra bytes away.
| * |
| * There are a couple of different approaches we can take to mitigate these: |
| * |
 * Making two gather calls sequentially and swizzling the results together
 * might work just fine.  Especially since the second one will be fulfilled
 * straight from the L1 cache lines.
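 *
 * A minimal sketch of that idea (untested; `idx_lo` and `idx_hi` are
 * hypothetical index vectors holding the byte offsets of records 0-7 and
 * 8-15, and the values are assumed already in native byte order):
 *
 *   __m256i lo = _mm256_i32gather_epi32 ((const int *) base, idx_lo, 1);
 *   __m256i hi = _mm256_i32gather_epi32 ((const int *) base, idx_hi, 1);
 *   // Keep only the low 16 bits of each 32-bit lane...
 *   lo = _mm256_and_si256 (lo, _mm256_set1_epi32 (0xFFFF));
 *   hi = _mm256_and_si256 (hi, _mm256_set1_epi32 (0xFFFF));
 *   // ...then pack 8+8 values into sixteen 16-bit lanes.
 *   __m256i v16 = _mm256_packus_epi32 (lo, hi);
 *   // _mm256_packus_epi32 interleaves the two 128-bit lanes; restore order.
 *   v16 = _mm256_permute4x64_epi64 (v16, _MM_SHUFFLE (3, 1, 2, 0));
 *
 * The overread concern above still applies: the gather for the last record
 * reads two bytes past it.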
| * |
 * Another snag I hit is that AVX2 only has signed comparisons, not unsigned.
 * So we add a bias of -32768 to map uint16_t values onto int16_t order before
 * comparing.
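 *
 * A sketch of that trick (with `a` and `b` as __m256i vectors of uint16_t
 * lanes):
 *
 *   const __m256i bias = _mm256_set1_epi16 (-32768);
 *   // (a > b) as unsigned 16-bit  ==  (a - 32768 > b - 32768) as signed 16-bit.
 *   __m256i gt = _mm256_cmpgt_epi16 (_mm256_add_epi16 (a, bias),
 *                                    _mm256_add_epi16 (b, bias));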
| * |
 * Also note that the order of arguments to _mm256_set_epi32() and family
 * is the opposite of what I originally assumed: the highest-index element
 * comes first.  The docs are correct, just not what you might assume.
| * |
| * PREFETCH |
| * |
| * We should use prefetch instructions to request load of the next batch of |
| * hb_glyph_info_t's while we process the current batch. |
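 *
 * For example (a sketch; `info` and `count` are hypothetical here):
 *
 *   for (unsigned i = 0; i < count; i += 8)
 *   {
 *     if (i + 8 < count)
 *       _mm_prefetch ((const char *) &info[i + 8], _MM_HINT_T0);
 *     // ... process info[i .. i+7] ...
 *   }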
| */ |
| |
| |
| /* |
| * Choose intrinsics set to use. |
| */ |
| |
| |
| #if !defined(HB_NO_SIMD) && (defined(HB_SIMD_AVX2) || defined(__AVX2__)) |
| |
| #pragma GCC target("avx2") |
| |
| #include <x86intrin.h> |
| |
| /* TODO: Test -mvzeroupper. */ |
| |
| |
| template <bool is_range = false> |
| static inline bool |
| hb_simd_ksearch_glyphid (unsigned *pos, /* Out */ |
| hb_codepoint_t k, |
| const void *base, |
| size_t length, |
| size_t stride) |
| { |
| if (unlikely (k & ~0xFFFF)) |
| { |
| *pos = length; |
| return false; |
| } |
| |
| *pos = 0; |
| |
  /* Find depth of search tree.  Steps are powers of 9: eight probes per level
   * make the search 9-ary. */
  static const unsigned steps[] = {1, 9, 81, 729, 6561, 59049};
| unsigned rank = 1; |
| while (rank < ARRAY_LENGTH (steps) && length >= steps[rank]) |
| rank++; |
| |
  static const __m256i _1x8 = _mm256_set1_epi32 (+1);
  static const __m256i __1x8 = _mm256_set1_epi32 (-1); /* All bits set. */
  static const __m256i _12345678 = _mm256_set_epi32 (8, 7, 6, 5, 4, 3, 2, 1);
  static const __m256i __32768x16 = _mm256_set1_epi16 (-32768); /* Bias mapping uint16_t onto int16_t order. */
| |
| /* Set up key vector. */ |
| const __m256i K = _mm256_add_epi16 (_mm256_set1_epi16 ((signed) k - 32768), _1x8); |
| |
| while (rank) |
| { |
| unsigned step = steps[--rank]; |
| |
| /* Load multiple ranges to test against. */ |
| const unsigned limit = stride * length; |
| const __m256i limits = _mm256_set1_epi32 (limit + 1); |
| const unsigned pitch = stride * step; |
| const __m256i pitches = _mm256_set1_epi32 (pitch); |
| const __m256i offsets = _mm256_mullo_epi32 (pitches, _12345678); |
| const __m256i mask = _mm256_cmpgt_epi32 (limits, offsets); |
| unsigned back_off = stride; |
| |
| /* The actual load... */ |
| __m256i V = _mm256_mask_i32gather_epi32 (__1x8, |
| (const int *) ((const char *) base - back_off), |
| offsets, mask, 1); |
| #if __BYTE_ORDER == __LITTLE_ENDIAN |
| static const __m256i bswap16_shuffle = _mm256_set_epi16 (0x0E0F,0x0C0D,0x0A0B,0x0809, |
| 0x0607,0x0405,0x0203,0x0001, |
| 0x0E0F,0x0C0D,0x0A0B,0x0809, |
| 0x0607,0x0405,0x0203,0x0001); |
| V = _mm256_shuffle_epi8 (V, bswap16_shuffle); |
| #endif |
| if (!is_range) |
| { |
| static const __m256i dup_shuffle = _mm256_set_epi16 (0x0D0C,0x0D0C,0x0908,0x0908, |
| 0x0504,0x0504,0x0100,0x0100, |
| 0x0D0C,0x0D0C,0x0908,0x0908, |
| 0x0504,0x0504,0x0100,0x0100); |
| V = _mm256_shuffle_epi8 (V, dup_shuffle); |
| } |
| V = _mm256_add_epi16 (V, __32768x16); |
| |
| /* Compare and locate. */ |
| unsigned answer = hb_ctz (~_mm256_movemask_epi8 (_mm256_cmpgt_epi16 (K, V))) >> 1; |
| bool found = answer & 1; |
| answer = (answer + 1) >> 1; |
| unsigned move = step * answer; |
| *pos += move; |
| if (found) |
| { |
| *pos -= 1; |
| return true; |
| } |
| length -= move; |
| base = (const void *) ((const char *) base + stride * move); |
| } |
| return false; |
| } |
| |
| static inline bool |
| hb_simd_ksearch_glyphid_range (unsigned *pos, /* Out */ |
| hb_codepoint_t k, |
| const void *base, |
| size_t length, |
| size_t stride) |
| { |
| return hb_simd_ksearch_glyphid<true> (pos, k, base, length, stride); |
| } |
| |
| |
| #elif !defined(HB_NO_SIMD) |
| #define HB_NO_SIMD |
| #endif |
| |
| #endif /* HB_SIMD_HH */ |