[Impeller] offload all text computation into vertex shader (#42417)

TextContents::Render occasionally shows up among the functions with the highest CPU usage. We can actually offload most of this computation into the vertex shader.
diff --git a/impeller/entity/contents/text_contents.cc b/impeller/entity/contents/text_contents.cc
index 09b6264..c4cd636 100644
--- a/impeller/entity/contents/text_contents.cc
+++ b/impeller/entity/contents/text_contents.cc
@@ -78,27 +78,32 @@
   return bounds->TransformBounds(entity.GetTransformation());
 }
 
-static bool CommonRender(
-    const ContentContext& renderer,
-    const Entity& entity,
-    RenderPass& pass,
-    const Color& color,
-    const TextFrame& frame,
-    Vector2 offset,
-    std::shared_ptr<GlyphAtlas>
-        atlas,  // NOLINT(performance-unnecessary-value-param)
-    Command& cmd) {
+static bool CommonRender(const ContentContext& renderer,
+                         const Entity& entity,
+                         RenderPass& pass,
+                         const Color& color,
+                         const TextFrame& frame,
+                         Vector2 offset,
+                         const std::shared_ptr<GlyphAtlas>& atlas,
+                         Command& cmd) {
   using VS = GlyphAtlasPipeline::VertexShader;
   using FS = GlyphAtlasPipeline::FragmentShader;
 
   // Common vertex uniforms for all glyphs.
   VS::FrameInfo frame_info;
-
   frame_info.mvp = Matrix::MakeOrthographic(pass.GetRenderTargetSize());
+  frame_info.atlas_size =
+      Vector2{static_cast<Scalar>(atlas->GetTexture()->GetSize().width),
+              static_cast<Scalar>(atlas->GetTexture()->GetSize().height)};
+  frame_info.offset = offset;
+  frame_info.is_translation_scale =
+      entity.GetTransformation().IsTranslationScaleOnly();
+  frame_info.entity_transform = entity.GetTransformation();
+
   VS::BindFrameInfo(cmd, pass.GetTransientsBuffer().EmplaceUniform(frame_info));
 
   SamplerDescriptor sampler_desc;
-  if (entity.GetTransformation().IsTranslationScaleOnly()) {
+  if (frame_info.is_translation_scale) {
     sampler_desc.min_filter = MinMagFilter::kNearest;
     sampler_desc.mag_filter = MinMagFilter::kNearest;
   } else {
@@ -152,12 +157,6 @@
     index_offset += 4;
   }
 
-  auto atlas_size =
-      Point{static_cast<Scalar>(atlas->GetTexture()->GetSize().width),
-            static_cast<Scalar>(atlas->GetTexture()->GetSize().height)};
-
-  Vector2 screen_offset = (entity.GetTransformation() * offset).Round();
-
   for (const auto& run : frame.GetRuns()) {
     const Font& font = run.GetFont();
 
@@ -168,40 +167,22 @@
         VALIDATION_LOG << "Could not find glyph position in the atlas.";
         return false;
       }
-
-      // For each glyph, we compute two rectangles. One for the vertex positions
-      // and one for the texture coordinates (UVs).
-
-      auto uv_origin =
-          (atlas_glyph_bounds->origin - Point(0.5, 0.5)) / atlas_size;
-      auto uv_size = (atlas_glyph_bounds->size + Size(1, 1)) / atlas_size;
-
-      // Rounding here prevents most jitter between glyphs in the run when
-      // nearest sampling.
-      auto screen_glyph_position =
-          screen_offset +
-          (entity.GetTransformation().Basis() *
-           (glyph_position.position + glyph_position.glyph.bounds.origin))
-              .Round();
+      Vector4 atlas_glyph_bounds_vec = Vector4(
+          atlas_glyph_bounds->origin.x, atlas_glyph_bounds->origin.y,
+          atlas_glyph_bounds->size.width, atlas_glyph_bounds->size.height);
+      Vector4 glyph_bounds_vec =
+          Vector4(glyph_position.glyph.bounds.origin.x,
+                  glyph_position.glyph.bounds.origin.y,
+                  glyph_position.glyph.bounds.size.width,
+                  glyph_position.glyph.bounds.size.height);
 
       for (const auto& point : unit_points) {
-        VS::PerVertexData vtx;
-
-        if (entity.GetTransformation().IsTranslationScaleOnly()) {
-          // Rouding up here prevents the bounds from becoming 1 pixel too small
-          // when nearest sampling. This path breaks down for projections.
-          vtx.position =
-              screen_glyph_position + (entity.GetTransformation().Basis() *
-                                       point * glyph_position.glyph.bounds.size)
-                                          .Ceil();
-        } else {
-          vtx.position = entity.GetTransformation() *
-                         Vector4(offset + glyph_position.position +
-                                 glyph_position.glyph.bounds.origin +
-                                 point * glyph_position.glyph.bounds.size);
-        }
-        vtx.uv = uv_origin + point * uv_size;
-        vertex_builder.AppendVertex(vtx);
+        vertex_builder.AppendVertex(VS::PerVertexData{
+            .atlas_glyph_bounds = atlas_glyph_bounds_vec,
+            .glyph_bounds = glyph_bounds_vec,
+            .unit_position = point,
+            .glyph_position = glyph_position.position,
+        });
       }
     }
   }
@@ -209,11 +190,7 @@
       vertex_builder.CreateVertexBuffer(pass.GetTransientsBuffer());
   cmd.BindVertices(vertex_buffer);
 
-  if (!pass.AddCommand(cmd)) {
-    return false;
-  }
-
-  return true;
+  return pass.AddCommand(cmd);
 }
 
 bool TextContents::Render(const ContentContext& renderer,
diff --git a/impeller/entity/shaders/glyph_atlas.vert b/impeller/entity/shaders/glyph_atlas.vert
index 3cf2ce7..e2a7e04 100644
--- a/impeller/entity/shaders/glyph_atlas.vert
+++ b/impeller/entity/shaders/glyph_atlas.vert
@@ -7,15 +7,75 @@
 
 uniform FrameInfo {
   mat4 mvp;
+  mat4 entity_transform;
+  vec2 atlas_size;
+  vec2 offset;
+  float is_translation_scale;
 }
 frame_info;
 
-in highp vec4 position;
-in vec2 uv;
+// Bounds of the glyph within the atlas texture, packed as XYWH.
+in vec4 atlas_glyph_bounds;
+// Bounds of the glyph itself (origin and size), packed as XYWH.
+in vec4 glyph_bounds;
+
+in vec2 unit_position;
+in vec2 glyph_position;
 
 out vec2 v_uv;
 
+mat4 basis(mat4 m) {
+  return mat4(m[0][0], m[0][1], m[0][2], 0.0,  //
+              m[1][0], m[1][1], m[1][2], 0.0,  //
+              m[2][0], m[2][1], m[2][2], 0.0,  //
+              0.0, 0.0, 0.0, 1.0               //
+  );
+}
+
+vec2 project(mat4 m, vec2 v) {
+  float w = v.x * m[0][3] + v.y * m[1][3] + m[3][3];
+  vec2 result = vec2(v.x * m[0][0] + v.y * m[1][0] + m[3][0],
+                     v.x * m[0][1] + v.y * m[1][1] + m[3][1]);
+
+  // This is Skia's behavior, but it may be reasonable to allow UB for the w=0
+  // case.
+  if (w != 0) {
+    w = 1 / w;
+  }
+  return result * w;
+}
+
 void main() {
+  vec2 screen_offset =
+      round(project(frame_info.entity_transform, frame_info.offset));
+
+  // For each glyph, we compute two rectangles. One for the vertex positions
+  // and one for the texture coordinates (UVs).
+  vec2 uv_origin = (atlas_glyph_bounds.xy - vec2(0.5)) / frame_info.atlas_size;
+  vec2 uv_size = (atlas_glyph_bounds.zw + vec2(1)) / frame_info.atlas_size;
+
+  // Rounding here prevents most jitter between glyphs in the run when
+  // nearest sampling.
+  mat4 basis_transform = basis(frame_info.entity_transform);
+  vec2 screen_glyph_position =
+      screen_offset +
+      round(project(basis_transform, (glyph_position + glyph_bounds.xy)));
+
+  vec4 position;
+  if (frame_info.is_translation_scale == 1.0) {
+    // Rounding up here prevents the bounds from becoming 1 pixel too small
+    // when nearest sampling. This path breaks down for projections.
+    position = vec4(
+        screen_glyph_position +
+            ceil(project(basis_transform, unit_position * glyph_bounds.zw)),
+        0.0, 1.0);
+  } else {
+    position = frame_info.entity_transform *
+               vec4(frame_info.offset + glyph_position + glyph_bounds.xy +
+                        unit_position * glyph_bounds.zw,
+                    0.0, 1.0);
+  }
+
   gl_Position = frame_info.mvp * position;
-  v_uv = uv;
+  v_uv = uv_origin + unit_position * uv_size;
 }
diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json
index 3697d3a..4d232a0 100644
--- a/impeller/tools/malioc.json
+++ b/impeller/tools/malioc.json
@@ -7552,22 +7552,22 @@
     "Mali-G78": {
       "core": "Mali-G78",
       "filename": "flutter/impeller/entity/gles/glyph_atlas.vert.gles",
-      "has_uniform_computation": false,
+      "has_uniform_computation": true,
       "type": "Vertex",
       "variants": {
         "Position": {
-          "fp16_arithmetic": 0,
+          "fp16_arithmetic": 96,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "load_store"
             ],
             "longest_path_cycles": [
-              0.265625,
-              0.265625,
+              0.34375,
+              0.34375,
+              0.171875,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ],
             "pipelines": [
@@ -7582,43 +7582,43 @@
               "load_store"
             ],
             "shortest_path_cycles": [
-              0.265625,
-              0.265625,
+              0.25,
+              0.25,
+              0.078125,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ],
             "total_bound_pipelines": [
               "load_store"
             ],
             "total_cycles": [
-              0.265625,
-              0.265625,
+              0.453125,
+              0.453125,
+              0.1875,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ]
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 16,
+          "uniform_registers_used": 26,
           "work_registers_used": 32
         },
         "Varying": {
-          "fp16_arithmetic": null,
+          "fp16_arithmetic": 100,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "load_store"
             ],
             "longest_path_cycles": [
+              0.078125,
+              0.078125,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ],
             "pipelines": [
@@ -7633,36 +7633,36 @@
               "load_store"
             ],
             "shortest_path_cycles": [
+              0.078125,
+              0.078125,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ],
             "total_bound_pipelines": [
               "load_store"
             ],
             "total_cycles": [
+              0.078125,
+              0.078125,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ]
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 8,
-          "work_registers_used": 6
+          "uniform_registers_used": 12,
+          "work_registers_used": 9
         }
       }
     },
     "Mali-T880": {
       "core": "Mali-T880",
       "filename": "flutter/impeller/entity/gles/glyph_atlas.vert.gles",
-      "has_uniform_computation": false,
+      "has_uniform_computation": true,
       "type": "Vertex",
       "variants": {
         "Main": {
@@ -7672,8 +7672,8 @@
               "load_store"
             ],
             "longest_path_cycles": [
-              2.9700000286102295,
-              5.0,
+              6.929999828338623,
+              7.0,
               0.0
             ],
             "pipelines": [
@@ -7685,21 +7685,21 @@
               "load_store"
             ],
             "shortest_path_cycles": [
-              2.9700000286102295,
-              5.0,
+              5.940000057220459,
+              7.0,
               0.0
             ],
             "total_bound_pipelines": [
-              "load_store"
+              "arithmetic"
             ],
             "total_cycles": [
-              3.0,
-              5.0,
+              9.0,
+              7.0,
               0.0
             ]
           },
           "thread_occupancy": 100,
-          "uniform_registers_used": 4,
+          "uniform_registers_used": 7,
           "work_registers_used": 2
         }
       }
@@ -10963,18 +10963,18 @@
       "type": "Vertex",
       "variants": {
         "Position": {
-          "fp16_arithmetic": 0,
+          "fp16_arithmetic": 100,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "load_store"
             ],
             "longest_path_cycles": [
-              0.25,
-              0.25,
+              0.3125,
+              0.3125,
+              0.15625,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ],
             "pipelines": [
@@ -10991,41 +10991,41 @@
             "shortest_path_cycles": [
               0.25,
               0.25,
+              0.09375,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ],
             "total_bound_pipelines": [
               "load_store"
             ],
             "total_cycles": [
-              0.25,
-              0.25,
+              0.4375,
+              0.4375,
+              0.171875,
               0.0,
-              0.0,
-              2.0,
+              4.0,
               0.0
             ]
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 24,
+          "uniform_registers_used": 34,
           "work_registers_used": 32
         },
         "Varying": {
-          "fp16_arithmetic": null,
+          "fp16_arithmetic": 100,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "load_store"
             ],
             "longest_path_cycles": [
+              0.09375,
+              0.09375,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ],
             "pipelines": [
@@ -11040,29 +11040,29 @@
               "load_store"
             ],
             "shortest_path_cycles": [
+              0.09375,
+              0.09375,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ],
             "total_bound_pipelines": [
               "load_store"
             ],
             "total_cycles": [
+              0.09375,
+              0.09375,
               0.0,
               0.0,
-              0.0,
-              0.0,
-              3.0,
+              4.0,
               0.0
             ]
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 24,
-          "work_registers_used": 6
+          "uniform_registers_used": 30,
+          "work_registers_used": 9
         }
       }
     }