src/hb-gpu.cc - third_party/harfbuzz - Git at Google

 /*
  * Copyright (C) 2026  Behdad Esfahbod
  *
  *  This is part of HarfBuzz, a text shaping library.
  *
  * Permission is hereby granted, without written agreement and without
  * license or royalty fees, to use, copy, modify, and distribute this
  * software and its documentation for any purpose, provided that the
  * above copyright notice and the following two paragraphs appear in
  * all copies of this software.
  *
  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  *
  * Author(s): Behdad Esfahbod
  */

 #include "hb.hh"

 #include "hb-gpu.h"
 #include "hb-gpu.hh"
 #include "hb-blob.hh"

 /**
  * SECTION:hb-gpu
  * @title: hb-gpu
  * @short_description: GPU text rendering
  * @include: hb-gpu.h
  *
  * This module provides GPU-side text rendering.  Glyph data is
  * encoded on the CPU into compact blobs that the GPU decodes and
  * rasterizes directly in the fragment shader, with no intermediate
  * bitmap atlas.
  *
  * Two renderers share the same atlas, vertex stage, and pipeline:
  *
  * - Draw, an antialiased monochrome coverage mask for outline
  *   glyphs.
  * - Paint, a premultiplied-RGBA color accumulator for COLRv0 and
  *   COLRv1 glyphs.  Paint also renders plain monochrome outlines
  *   (as a single foreground-colored layer), at some cost over the
  *   Draw path.
  *
  * Applications typically pick one renderer per font, based on
  * whether the font has a color paint table (see
  * hb_ot_color_has_paint()).  The Draw path is meaningfully faster
  * than Paint on monochrome fonts, so using Paint unconditionally
  * leaves performance on the table.
  *
  * See the [live web demo](https://harfbuzz.github.io/hb-gpu-demo/).
  *
  * # Draw
  *
  * The draw renderer implements the
  * [Slug](https://github.com/EricLengyel/Slug) algorithm by Eric
  * Lengyel.  It produces an antialiased coverage mask in the
  * fragment shader, suitable for compositing over any background
  * or multiplying with any foreground color.
  *
  * ## Encoding
  *
  * Each glyph is encoded independently into an opaque blob of
  * RGBA16I texels (8 bytes each).  Typical encoding flow:
  *
  * |[<!-- language="plain" -->
  * hb_gpu_draw_t *draw = hb_gpu_draw_create_or_fail ();
  *
  * hb_gpu_draw_glyph (draw, font, gid);
  * hb_blob_t *blob = hb_gpu_draw_encode (draw);
  *
  * // Upload hb_blob_get_data(blob) to the atlas at some offset.
  * unsigned atlas_offset = upload_to_atlas (blob);
  *
  * hb_gpu_draw_recycle_blob (draw, blob);
  * // repeat for more glyphs...
  *
  * hb_gpu_draw_destroy (draw);
  * ]|
  *
  * Create the encoder once and reuse it across glyphs.
  * hb_gpu_draw_encode() auto-clears the accumulated outlines on
  * return, so the next call to hb_gpu_draw_glyph() starts fresh;
  * user configuration (font scale) is preserved.  Pass the
  * encoded blob back via hb_gpu_draw_recycle_blob() after upload
  * to recycle the buffer across encodes.
  *
  * ## Atlas setup
  *
  * All encoded blobs are uploaded into a single GL_TEXTURE_BUFFER
  * backed by a GL_RGBA16I format buffer.  Each glyph occupies a
  * contiguous range of texels at a known offset:
  *
  * |[<!-- language="plain" -->
  * GLuint buf, tex;
  * glGenBuffers (1, &buf);
  * glGenTextures (1, &tex);
  *
  * glBindBuffer (GL_TEXTURE_BUFFER, buf);
  * glBufferData (GL_TEXTURE_BUFFER, capacity * 8, NULL, GL_STATIC_DRAW);
  *
  * glBindTexture (GL_TEXTURE_BUFFER, tex);
  * glTexBuffer (GL_TEXTURE_BUFFER, GL_RGBA16I, buf);
  *
  * // Upload a glyph blob at 'offset' texels:
  * glBufferSubData (GL_TEXTURE_BUFFER,
  *                  offset * 8,
  *                  hb_blob_get_length (blob),
  *                  hb_blob_get_data (blob, NULL));
  * ]|
  *
  * ## Shader compilation
  *
  * The library provides GLSL source strings for shared helpers
  * (atlas access, vertex dilation) and for the draw-renderer stage
  * code.  Prepend a #version directive, concatenate shared and
  * draw sources in order, and append your own main():
  *
  * |[<!-- language="plain" -->
  * const char *vert_sources[] = {
  *     "#version 330\n",
  *     hb_gpu_shader_source      (HB_GPU_SHADER_STAGE_VERTEX,
  *                                HB_GPU_SHADER_LANG_GLSL),
  *     hb_gpu_draw_shader_source (HB_GPU_SHADER_STAGE_VERTEX,
  *                                HB_GPU_SHADER_LANG_GLSL),
  *     your_vertex_main
  * };
  * glShaderSource (vert_shader, 4, vert_sources, NULL);
  * // Same pattern for the fragment shader.
  * ]|
  *
  * hb_gpu_draw_shader_source() returns an empty string for the
  * vertex stage, so the assembly order above is safe.
  *
  * ## Vertex shader
  *
  * The vertex library provides one function:
  *
  * |[<!-- language="plain" -->
  * void hb_gpu_dilate (inout vec2 position, inout vec2 texcoord,
  *                     vec2 normal, vec4 jac,
  *                     mat4 m, vec2 viewport);
  * ]|
  *
  * This expands each glyph quad by approximately half a pixel on
  * screen to ensure proper antialiasing coverage at the edges.
  * It must be called in the vertex shader before computing
  * gl_Position.
  *
  * Per-vertex attributes for each glyph quad (2 triangles, 6
  * vertices, 4 unique corners):
  *
  * - position (vec2): object-space vertex position, computed from
  *   the glyph's bounding box.  For corner (cx, cy) where cx,cy
  *   are 0 or 1:
  *   |[<!-- language="plain" -->
  *   pos.x = pen_x + scale * lerp(extent_min_x, extent_max_x, cx)
  *   pos.y = pen_y - scale * lerp(extent_min_y, extent_max_y, cy)
  *   ]|
  *   where scale = font_size / upem.
  *
  * - texcoord (vec2): em-space texture coordinates, equal to the
  *   raw extent values: `(ex, ey)` where ex/ey are the glyph's
  *   bounding box corners in font design units.  The fragment shader
  *   samples the encoded blob in this coordinate space.
  *
  * - normal (vec2): outward normal at each corner.
  *   `(-1, +1)` for (0,0), `(+1, +1)` for (1,0),
  *   `(-1, -1)` for (0,1), `(+1, -1)` for (1,1).
  *   Note: y is negated relative to cx,cy because the common
  *   em-to-object transform flips y.
  *
  * - jac (vec4): maps object-space displacements back to
  *   em-space after dilation, stored row-major as
  *   (j00, j01, j10, j11).  For the common case of uniform
  *   scaling with y-flip:
  *   |[<!-- language="plain" -->
  *   jac = vec4(emPerPos, 0.0, 0.0, -emPerPos)
  *   ]|
  *   where emPerPos = upem / font_size.  For non-trivial
  *   transforms, see the hb_gpu_dilate source.
  *
  * - m (mat4): the model-view-projection matrix (uniform).
  *
  * - viewport (vec2): the viewport size in pixels (uniform).
  *
  * A typical vertex main:
  *
  * |[<!-- language="plain" -->
  * uniform mat4 u_matViewProjection;
  * uniform vec2 u_viewport;
  *
  * in vec2 a_position;
  * in vec2 a_texcoord;
  * in vec2 a_normal;
  * in float a_emPerPos;
  * in uint a_glyphLoc;
  *
  * out vec2 v_texcoord;
  * flat out uint v_glyphLoc;
  *
  * void main () {
  *   vec2 pos = a_position;
  *   vec2 tex = a_texcoord;
  *   vec4 jac = vec4 (a_emPerPos, 0.0, 0.0, -a_emPerPos);
  *   hb_gpu_dilate (pos, tex, a_normal, jac,
  *                  u_matViewProjection, u_viewport);
  *   gl_Position = u_matViewProjection * vec4 (pos, 0.0, 1.0);
  *   v_texcoord = tex;
  *   v_glyphLoc = a_glyphLoc;
  * }
  * ]|
  *
  * ## Font scale
  *
  * The encoder works in font design units (upem).  For best
  * results, do not set a scale on the #hb_font_t passed to
  * hb_gpu_draw_glyph() — leave it at the default (upem).
  * Scaling is applied later when computing vertex positions:
  *
  * |[<!-- language="plain" -->
  * float scale = font_size / upem;
  * pos.x = pen_x + scale * extent_x;
  * pos.y = pen_y - scale * extent_y;
  * ]|
  *
  * If you do set a font scale (e.g. for hinting), the encoded
  * blob and extents will be in the scaled coordinate space,
  * and you must adjust emPerPos accordingly.
  *
  * ## Dilation
  *
  * Each glyph is rendered on a screen-aligned quad whose
  * corners match the glyph's bounding box.  Without dilation,
  * the antialiased edges at the quad boundary get clipped — the
  * fragment shader needs at least half a pixel of padding
  * around the glyph to produce smooth coverage gradients.
  *
  * hb_gpu_dilate computes a perspective-correct half-pixel
  * expansion.  It adjusts both the vertex position (expanding
  * the quad) and the texcoord (so the fragment shader samples
  * the right location in em-space after expansion).
  *
  * The `jac` parameter maps object-space displacements back to
  * em-space.  For the common case of uniform scaling with
  * y-flip (`pos.y = pen_y - scale * ey`):
  *
  * |[<!-- language="plain" -->
  * float emPerPos = upem / font_size;
  * jac = vec4(emPerPos, 0.0, 0.0, -emPerPos);
  * ]|
  *
  * For non-uniform scaling or shearing transforms, jac is the
  * 2x2 inverse of the em-to-object linear transform, stored
  * row-major.
  *
  * ### Static dilation alternative
  *
  * Applications that do not need perspective correctness (e.g.
  * strictly 2D rendering at known sizes) can skip hb_gpu_dilate
  * and instead expand each quad vertex by a fixed amount along
  * its normal, based on the minimum expected font size:
  *
  * |[<!-- language="plain" -->
  * float min_ppem = 10.0;  // smallest expected size in pixels
  * float pad = 0.5 * upem / min_ppem;  // half-pixel in em units
  * pos += normal * pad * scale;
  * texcoord += normal * pad;
  * ]|
  *
  * This over-dilates at larger sizes (wasting fill rate on
  * transparent pixels) but is simpler to implement and avoids
  * the need for the MVP matrix and viewport size in the vertex
  * shader.
  *
  * ## Fragment shader
  *
  * The fragment library provides two functions:
  *
  * |[<!-- language="plain" -->
  * float hb_gpu_draw (vec2 renderCoord, uint glyphLoc);
  * ]|
  *
  * It requires the `hb_gpu_atlas` uniform to be bound to the
  * texture containing the encoded glyph data.  By default this
  * is an `isamplerBuffer` (GL_TEXTURE_BUFFER).  For platforms
  * without texture buffers (e.g. WebGL2), define
  * `HB_GPU_ATLAS_2D` before including the shader source; the
  * atlas then becomes an `isampler2D` and a
  * `uniform int hb_gpu_atlas_width` must also be set to the
  * texture width.
  *
  * Parameters:
  *
  * - renderCoord: the interpolated em-space coordinate from
  *   the vertex shader (v_texcoord).
  *
  * - glyphLoc: the texel offset of this glyph's encoded blob
  *   in the atlas buffer.  Passed as a flat varying from the
  *   vertex shader.
  *
  * Returns the coverage in [0, 1]: 1.0 inside the glyph, 0.0
  * outside, with antialiased edges.
  *
  * A typical fragment main:
  *
  * |[<!-- language="plain" -->
  * in vec2 v_texcoord;
  * flat in uint v_glyphLoc;
  * out vec4 fragColor;
  *
  * void main () {
  *   float coverage = hb_gpu_draw (v_texcoord, v_glyphLoc);
  *   fragColor = vec4 (0.0, 0.0, 0.0, coverage);
  * }
  * ]|
  *
  * The coverage value can be used with alpha blending
  * (GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) to composite
  * over a background, or multiplied with any color.
  *
  * ## Stem darkening
  *
  * |[<!-- language="plain" -->
  * float hb_gpu_stem_darken (float coverage, float brightness, float ppem);
  * ]|
  *
  * Optional stem darkening adjusts coverage at small sizes so
  * thin stems are not washed out by gamma correction.
  *
  * Parameters:
  *
  * - coverage: a coverage / alpha value in [0, 1].  For
  *   hb_gpu_draw() this is the returned coverage; for
  *   hb_gpu_paint() it is the alpha of the returned vec4.
  *
  * - brightness: brightness of the color that will end up on
  *   screen for this fragment, in [0, 1], computed as
  *   `dot (straight_color.rgb, vec3 (1.0 / 3.0))`.
  *
  * - ppem: pixels per em at this fragment, computed as
  *   `1.0 / max (fwidth (v_texcoord).x, fwidth (v_texcoord).y)`.
  *
  * Example — draw path:
  *
  * |[<!-- language="plain" -->
  * float coverage = hb_gpu_draw (v_texcoord, v_glyphLoc);
  * float brightness = dot (u_foreground.rgb, vec3 (1.0 / 3.0));
  * float ppem = 1.0 / max (fwidth (v_texcoord).x,
  *                          fwidth (v_texcoord).y);
  * coverage = hb_gpu_stem_darken (coverage, brightness, ppem);
  * ]|
  *
  * Example — paint path: hb_gpu_paint() returns premultiplied
  * RGBA, and each paint layer has its own color, so compute
  * brightness from the fragment's own straight (un-premultiplied)
  * color rather than any shader uniform.  This gives the right
  * per-layer darkening on a multi-color glyph, and reduces to the
  * draw-path behaviour on a monochrome glyph (whose single
  * synthesized layer's color is the foreground uniform).
  *
  * |[<!-- language="plain" -->
  * vec4 c = hb_gpu_paint (v_texcoord, v_glyphLoc, u_foreground);
  * if (c.a > 0.0) {
  *   float brightness = dot (c.rgb, vec3 (1.0 / 3.0)) / c.a;
  *   float ppem = 1.0 / max (fwidth (v_texcoord).x,
  *                            fwidth (v_texcoord).y);
  *   float darkened = hb_gpu_stem_darken (c.a, brightness, ppem);
  *   c *= darkened / c.a;  // keep premultiplied RGB and A in sync
  * }
  * ]|
  *
  * # Paint
  *
  * The paint renderer walks the font's paint tree (COLRv0 layers
  * or COLRv1 paint subtree) and records one layer op per solid or
  * gradient fill, each with its own clip outline encoded as a Slug
  * (so the paint renderer reuses the draw renderer internally for
  * clips).  Palette colors and custom overrides are resolved and
  * baked into the blob at encode time, so the shader does not need
  * a separate palette uniform.
  *
  * Monochrome (non-color) glyphs are handled transparently:
  * hb_gpu_paint_glyph() synthesizes a single foreground-colored
  * layer from the glyph's outline, so callers can use the paint
  * renderer uniformly for any font.  This costs more per fragment
  * than the Draw path, but the rendered result is the same.
  *
  * ## Encoding
  *
  * |[<!-- language="plain" -->
  * hb_gpu_paint_t *paint = hb_gpu_paint_create_or_fail ();
  *
  * // Optional: select a palette and install any custom-palette
  * // overrides before hb_gpu_paint_glyph — their values are baked
  * // into the encoded blob.  (The foreground color is NOT set here:
  * // it stays a shader uniform so dark-mode and color-change
  * // toggles take effect without re-encoding.)
  * hb_gpu_paint_set_palette (paint, 0);
  * hb_gpu_paint_set_custom_palette_color (paint, 2,
  *                                        HB_COLOR (0xff, 0, 0, 0xff));
  *
  * hb_gpu_paint_glyph (paint, font, gid);
  * hb_glyph_extents_t ext;
  * hb_blob_t *blob = hb_gpu_paint_encode (paint, &ext);
  *
  * // Upload hb_blob_get_data(blob) to the same atlas used by draw.
  * unsigned atlas_offset = upload_to_atlas (blob);
  *
  * hb_gpu_paint_recycle_blob (paint, blob);
  * // repeat for more glyphs...
  *
  * hb_gpu_paint_destroy (paint);
  * ]|
  *
  * As with the draw encoder, create the paint encoder once and
  * reuse it.  hb_gpu_paint_encode() auto-clears on return; user
  * configuration (palette, custom-palette overrides) is preserved
  * across encodes.  Because colors are baked, changing any of that
  * configuration afterwards requires re-encoding the affected
  * glyphs.  The encoded blob's texel format is the same RGBA16I
  * as the draw renderer's, so both kinds of blobs can coexist in
  * one atlas at different offsets.
  *
  * ## Shader composition
  *
  * Same pattern as the draw renderer but using
  * hb_gpu_paint_shader_source() for the paint-specific helpers.
  * The vertex stage is identical — hb_gpu_paint_shader_source()
  * returns an empty string for HB_GPU_SHADER_STAGE_VERTEX, so the
  * fixed assembly order still works:
  *
  * |[<!-- language="plain" -->
  * const char *frag_sources[] = {
  *     "#version 330\n",
  *     hb_gpu_shader_source       (HB_GPU_SHADER_STAGE_FRAGMENT,
  *                                 HB_GPU_SHADER_LANG_GLSL),
  *     hb_gpu_paint_shader_source (HB_GPU_SHADER_STAGE_FRAGMENT,
  *                                 HB_GPU_SHADER_LANG_GLSL),
  *     your_fragment_main
  * };
  * glShaderSource (frag_shader, 4, frag_sources, NULL);
  * ]|
  *
  * The atlas setup, vertex attributes, dilation, and font scaling
  * guidance from the Draw section all apply unchanged.
  *
  * ## Fragment shader
  *
  * The paint library provides one function:
  *
  * |[<!-- language="plain" -->
  * vec4 hb_gpu_paint (vec2 renderCoord, uint glyphLoc, vec4 foreground);
  * ]|
  *
  * Parameters:
  *
  * - renderCoord: the interpolated em-space coordinate from the
  *   vertex shader (v_texcoord), as for hb_gpu_draw().
  *
  * - glyphLoc: the texel offset of this glyph's encoded paint blob
  *   in the atlas, passed as a flat varying from the vertex shader.
  *
  * - foreground: the foreground color used to resolve palette
  *   entries that COLRv1 marks as "foreground color".  The encoder
  *   only records the is-foreground flag per layer/stop; the actual
  *   color is substituted here at render time, so runtime
  *   foreground-color changes (e.g. a dark-mode toggle) take effect
  *   without re-encoding any glyphs.
  *
  * Returns premultiplied RGBA: fully transparent outside the glyph,
  * composited layers inside.  Use (GL_ONE, GL_ONE_MINUS_SRC_ALPHA)
  * blending to composite over a background.
  *
  * A typical fragment main:
  *
  * |[<!-- language="plain" -->
  * uniform vec4 u_foreground;
  * in vec2 v_texcoord;
  * flat in uint v_glyphLoc;
  * out vec4 fragColor;
  *
  * void main () {
  *   fragColor = hb_gpu_paint (v_texcoord, v_glyphLoc, u_foreground);
  * }
  * ]|
  *
  * Stem darkening applies to the paint output the same way it does
  * for draw; the alpha channel of the returned vec4 carries the
  * coverage.  See the Draw section's Stem darkening notes.
  **/


 #include "hb-gpu-fragment-glsl.hh"
 #include "hb-gpu-fragment-msl.hh"
 #include "hb-gpu-fragment-wgsl.hh"
 #include "hb-gpu-fragment-hlsl.hh"
 #include "hb-gpu-vertex-glsl.hh"
 #include "hb-gpu-vertex-msl.hh"
 #include "hb-gpu-vertex-wgsl.hh"
 #include "hb-gpu-vertex-hlsl.hh"

 /**
  * hb_gpu_shader_source:
  * @stage: pipeline stage (vertex or fragment)
  * @lang: shader language variant
  *
  * Returns the shared helper shader source used by the hb-gpu
  * renderers for the specified stage and language.  The returned
  * string is static and must not be freed.
  *
  * The shared source defines utilities such as the atlas sampler
  * and the `hb_gpu_fetch()` accessor for the fragment stage, and
  * the `hb_gpu_dilate()` helper for the vertex stage.  Each
  * renderer-specific source (for example,
  * hb_gpu_draw_shader_source()) assumes these helpers are already
  * in scope — callers should concatenate a `#version` directive,
  * then this shared source, then the renderer-specific source,
  * then their own `main()`.
  *
  * Return value: (transfer none):
  * A shader source string, or `NULL` if @stage or @lang is
  * unsupported.
  *
  * XSince: REPLACEME
  **/
 const char *
 hb_gpu_shader_source (hb_gpu_shader_stage_t stage,
 		      hb_gpu_shader_lang_t  lang)
 {
   switch (stage) {
   case HB_GPU_SHADER_STAGE_FRAGMENT:
     switch (lang) {
     case HB_GPU_SHADER_LANG_GLSL: return hb_gpu_fragment_glsl;
     case HB_GPU_SHADER_LANG_MSL:  return hb_gpu_fragment_msl;
     case HB_GPU_SHADER_LANG_WGSL: return hb_gpu_fragment_wgsl;
     case HB_GPU_SHADER_LANG_HLSL: return hb_gpu_fragment_hlsl;
     default: return nullptr;
     }
   case HB_GPU_SHADER_STAGE_VERTEX:
     switch (lang) {
     case HB_GPU_SHADER_LANG_GLSL: return hb_gpu_vertex_glsl;
     case HB_GPU_SHADER_LANG_MSL:  return hb_gpu_vertex_msl;
     case HB_GPU_SHADER_LANG_WGSL: return hb_gpu_vertex_wgsl;
     case HB_GPU_SHADER_LANG_HLSL: return hb_gpu_vertex_hlsl;
     default: return nullptr;
     }
   default:
     return nullptr;
   }
 }


 /* ---- Internal blob-recycling helpers (shared by draw and paint) ---- */

 void
 _hb_gpu_blob_data_destroy (void *user_data)
 {
   auto *bd = (hb_gpu_blob_data_t *) user_data;
   hb_free (bd->buf);
   hb_free (bd);
 }

 char *
 _hb_gpu_blob_acquire (hb_blob_t *recycled,
 		      unsigned   needed,
 		      unsigned  *out_capacity,
 		      char     **out_replaced_buf)
 {
   *out_replaced_buf = nullptr;

   if (recycled && recycled->destroy == _hb_gpu_blob_data_destroy)
   {
     auto *bd = (hb_gpu_blob_data_t *) recycled->user_data;
     if (bd->capacity >= needed)
     {
       *out_capacity = bd->capacity;
       return bd->buf;
     }
     /* Grow with a 1.5x ramp to amortize repeated growth. */
     unsigned alloc_bytes = needed;
     if (unlikely (hb_unsigned_add_overflows (needed, needed / 2,
 					     &alloc_bytes)))
       alloc_bytes = needed;
     char *new_buf = (char *) hb_realloc (bd->buf, alloc_bytes);
     if (new_buf)
     {
       bd->buf = new_buf;
       bd->capacity = alloc_bytes;
       *out_capacity = alloc_bytes;
       return new_buf;
     }
     /* Realloc failed.  Fall through to a fresh hb_malloc and stash
      * the old buf for the caller to free after _finalize. */
     *out_replaced_buf = bd->buf;
   }

   char *buf = (char *) hb_malloc (needed);
   if (unlikely (!buf))
     return nullptr;
   *out_capacity = needed;
   return buf;
 }

 hb_blob_t *
 _hb_gpu_blob_finalize (char      *buf,
 		       unsigned   capacity,
 		       unsigned   length,
 		       hb_blob_t *recycled,
 		       char      *replaced_recycled_buf)
 {
   if (recycled && recycled->destroy == _hb_gpu_blob_data_destroy)
   {
     auto *bd = (hb_gpu_blob_data_t *) recycled->user_data;
     if (replaced_recycled_buf && replaced_recycled_buf != buf)
       hb_free (replaced_recycled_buf);
     bd->buf = buf;
     bd->capacity = capacity;
     recycled->data = (const char *) buf;
     recycled->length = length;
     return recycled;
   }

   /* No recycled blob to update -- create a fresh one with our
    * destroy closure so the next recycle round can reuse it. */
   hb_gpu_blob_data_t *bd = (hb_gpu_blob_data_t *) hb_malloc (sizeof (*bd));
   if (unlikely (!bd))
   {
     hb_free (buf);
     return nullptr;
   }
   bd->buf = buf;
   bd->capacity = capacity;

   return hb_blob_create ((const char *) buf, length,
 			 HB_MEMORY_MODE_WRITABLE,
 			 bd, _hb_gpu_blob_data_destroy);
 }

 void
 _hb_gpu_blob_abort (char *buf, hb_blob_t *recycled)
 {
   if (!buf) return;
   if (recycled && recycled->destroy == _hb_gpu_blob_data_destroy)
   {
     auto *bd = (hb_gpu_blob_data_t *) recycled->user_data;
     if (buf == bd->buf) return;  /* owned by the recycled blob */
   }
   hb_free (buf);
 }

 void
 _hb_gpu_blob_recycle (hb_blob_t **slot, hb_blob_t *blob)
 {
   hb_blob_destroy (*slot);
   *slot = nullptr;
   if (!blob || blob == hb_blob_get_empty ())
     return;
   *slot = blob;
 }