Add A/B test mode to local devicelab runner (#54494)

* Add A/B test mode to local devicelab runner
diff --git a/dev/devicelab/README.md b/dev/devicelab/README.md
index 6310f3f..8cc0021 100644
--- a/dev/devicelab/README.md
+++ b/dev/devicelab/README.md
@@ -162,6 +162,47 @@
 
 An example of a local engine architecture is `android_debug_unopt_x86`.
 
+## Running an A/B test for engine changes
+
+You can run an A/B test that compares the performance of the default engine
+against a local engine build. The test runs the same benchmark a specified
+number of times against both engines, then outputs a tab-separated spreadsheet
+with the results. The results can be copied to a Google Spreadsheet for further
+inspection.
+
+Example:
+
+```sh
+../../bin/cache/dart-sdk/bin/dart bin/run.dart --ab=10 \
+  --local-engine=host_debug_unopt \
+  -t bin/tasks/web_benchmarks_canvaskit.dart
+```
+
+`--ab=10` tells the runner to run the A/B test 10 times.
+
+`--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt`
+engine build. `--local-engine` is required for A/B tests.
+
+An A/B test can run exactly one task. Multiple tasks are not supported.
+
+Example output:
+
+```
+Score	Average A (noise)	Average B (noise)	Speed-up
+bench_card_infinite_scroll.canvaskit.drawFrameDuration.average	2900.20 (8.44%)	2426.70 (8.94%)	1.20x
+bench_card_infinite_scroll.canvaskit.totalUiFrame.average	4964.00 (6.29%)	4098.00 (8.03%)	1.21x
+draw_rect.canvaskit.windowRenderDuration.average	1959.45 (16.56%)	2286.65 (0.61%)	0.86x
+draw_rect.canvaskit.sceneBuildDuration.average	1969.45 (16.37%)	2294.90 (0.58%)	0.86x
+draw_rect.canvaskit.drawFrameDuration.average	5335.20 (17.59%)	6437.60 (0.59%)	0.83x
+draw_rect.canvaskit.totalUiFrame.average	6832.00 (13.16%)	7932.00 (0.34%)	0.86x
+```
+
+The output contains the average and the noise for each score. More importantly,
+it contains the speed-up value, i.e. how much _faster_ the local engine is than
+the default engine. Values less than 1.0 indicate a slow-down. For example,
+0.5x means the local engine is twice as slow as the default engine, and 2.0x
+means it's twice as fast. Higher is better.
+
 # Reproducing broken builds locally
 
 To reproduce the breakage locally `git checkout` the corresponding Flutter
diff --git a/dev/devicelab/bin/run.dart b/dev/devicelab/bin/run.dart
index a3f1762..7a2daa6 100644
--- a/dev/devicelab/bin/run.dart
+++ b/dev/devicelab/bin/run.dart
@@ -9,18 +9,34 @@
 import 'package:args/args.dart';
 import 'package:path/path.dart' as path;
 
+import 'package:flutter_devicelab/framework/ab.dart';
 import 'package:flutter_devicelab/framework/manifest.dart';
 import 'package:flutter_devicelab/framework/runner.dart';
 import 'package:flutter_devicelab/framework/utils.dart';
 
+ArgResults args;
+
 List<String> _taskNames = <String>[];
 
+/// Whether to suppress standard output and print only standard error output.
+bool silent;
+
+/// The build of the local engine to use.
+///
+/// Required for A/B test mode.
+String localEngine;
+
+/// The path to the engine "src/" directory.
+String localEngineSrcPath;
+
+/// Whether to exit on first test failure.
+bool exitOnFirstTestFailure;
+
 /// Runs tasks.
 ///
 /// The tasks are chosen depending on the command-line options
 /// (see [_argParser]).
 Future<void> main(List<String> rawArgs) async {
-  ArgResults args;
   try {
     args = _argParser.parse(rawArgs);
   } on FormatException catch (error) {
@@ -55,10 +71,19 @@
     return;
   }
 
-  final bool silent = args['silent'] as bool;
-  final String localEngine = args['local-engine'] as String;
-  final String localEngineSrcPath = args['local-engine-src-path'] as String;
+  silent = args['silent'] as bool;
+  localEngine = args['local-engine'] as String;
+  localEngineSrcPath = args['local-engine-src-path'] as String;
+  exitOnFirstTestFailure = args['exit'] as bool;
 
+  if (args.wasParsed('ab')) {
+    await _runABTest();
+  } else {
+    await _runTasks();
+  }
+}
+
+Future<void> _runTasks() async {
   for (final String taskName in _taskNames) {
     section('Running task "$taskName"');
     final Map<String, dynamic> result = await runTask(
@@ -74,13 +99,73 @@
 
     if (!(result['success'] as bool)) {
       exitCode = 1;
-      if (args['exit'] as bool) {
+      if (exitOnFirstTestFailure) {
         return;
       }
     }
   }
 }
 
+/// Runs the selected task `--ab` times on the default and the local engine.
+///
+/// Each iteration runs the default engine (A) first, then the local engine
+/// (B). Exits with code 1 on invalid options or on the first failed run;
+/// otherwise prints the [ABTest] summary once all runs have completed.
+Future<void> _runABTest() async {
+  final int runsPerTest = int.parse(args['ab'] as String);
+
+  // Reject zero tasks as well as several: `_taskNames.single` below throws a
+  // StateError unless the list holds exactly one element.
+  if (_taskNames.length != 1) {
+    stderr.writeln('When running in A/B test mode exactly one task must be passed but got ${_taskNames.join(', ')}.\n');
+    stderr.writeln(_argParser.usage);
+    exit(1);
+  }
+
+  if (!args.wasParsed('local-engine')) {
+    stderr.writeln('When running in A/B test mode --local-engine is required.\n');
+    stderr.writeln(_argParser.usage);
+    exit(1);
+  }
+
+  final String taskName = _taskNames.single;
+
+  print('$taskName A/B test. Will run $runsPerTest times.');
+
+  final ABTest abTest = ABTest();
+  for (int i = 1; i <= runsPerTest; i++) {
+    section('Run #$i');
+
+    print('Running with the default engine (A)');
+    final Map<String, dynamic> defaultEngineResult = await runTask(taskName, silent: silent);
+    print('Default engine result:');
+    print(const JsonEncoder.withIndent('  ').convert(defaultEngineResult));
+
+    if (!(defaultEngineResult['success'] as bool)) {
+      stderr.writeln('Task failed on the default engine.');
+      exit(1);
+    }
+    abTest.addAResult(defaultEngineResult);
+
+    print('Running with the local engine (B)');
+    final Map<String, dynamic> localEngineResult = await runTask(
+      taskName,
+      silent: silent,
+      localEngine: localEngine,
+      localEngineSrcPath: localEngineSrcPath,
+    );
+    print('Task localEngineResult:');
+    print(const JsonEncoder.withIndent('  ').convert(localEngineResult));
+
+    if (!(localEngineResult['success'] as bool)) {
+      stderr.writeln('Task failed on the local engine.');
+      exit(1);
+    }
+    abTest.addBResult(localEngineResult);
+  }
+  print(abTest.printSummary());
+}
+
 void addTasks({
   List<ManifestTask> tasks,
   ArgResults args,
@@ -132,6 +217,22 @@
       }
     },
   )
+  ..addOption(
+    'ab',
+    help: 'Runs an A/B test comparing the default engine with the local\n'
+          'engine build for one task. This option does not support running\n'
+          'multiple tasks. The value is the number of times to run the task.\n'
+          'The task is expected to be a benchmark that reports score keys.\n'
+          'The A/B test collects the metrics collected by the test and\n'
+          'produces a report containing averages, noise, and the speed-up\n'
+          'between the two engines. --local-engine is required when running\n'
+          'an A/B test.',
+    callback: (String value) {
+      if (value != null && int.tryParse(value) == null) {
+        throw ArgParserException('Option --ab must be a number, but was "$value".');
+      }
+    },
+  )
   ..addFlag(
     'all',
     abbr: 'a',
@@ -152,7 +253,8 @@
     help: 'Name of a build output within the engine out directory, if you\n'
           'are building Flutter locally. Use this to select a specific\n'
           'version of the engine if you have built multiple engine targets.\n'
-          'This path is relative to --local-engine-src-path/out.',
+          'This path is relative to --local-engine-src-path/out. This option\n'
+          'is required when running an A/B test (see the --ab option).',
   )
   ..addFlag(
     'list',
diff --git a/dev/devicelab/lib/framework/ab.dart b/dev/devicelab/lib/framework/ab.dart
new file mode 100644
index 0000000..ad1de24
--- /dev/null
+++ b/dev/devicelab/lib/framework/ab.dart
@@ -0,0 +1,136 @@
+// Copyright 2014 The Flutter Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+import 'dart:math' as math;
+import 'package:meta/meta.dart';
+
+/// Gathers the scores of an A/B test and renders them for human evaluation.
+///
+/// Feed it one result per run through [addAResult] and [addBResult], then
+/// call [printSummary] to obtain the report.
+class ABTest {
+  final Map<String, List<double>> _aResults = <String, List<double>>{};
+  final Map<String, List<double>> _bResults = <String, List<double>>{};
+
+  /// Records the scores of a single A run of the benchmark.
+  ///
+  /// [result] is expected to be a serialization of [TaskResult]. It may
+  /// carry several score keys; each one is tracked separately.
+  void addAResult(Map<String, dynamic> result) => _addResult(result, _aResults);
+
+  /// Records the scores of a single B run of the benchmark.
+  ///
+  /// [result] is expected to be a serialization of [TaskResult]. It may
+  /// carry several score keys; each one is tracked separately.
+  void addBResult(Map<String, dynamic> result) => _addResult(result, _bResults);
+
+  /// Returns the summary as tab-separated text, one row per score key.
+  ///
+  /// The columns are the score key, the A average with its noise, the B
+  /// average with its noise, and the speed-up (A average divided by B
+  /// average). A cell is left empty when its side never reported the key.
+  ///
+  /// Despite its name, this method prints nothing; the caller decides what to
+  /// do with the text, e.g. paste it into a Google Spreadsheet.
+  String printSummary() {
+    final Map<String, _ScoreSummary> aSummaries = _summarize(_aResults);
+    final Map<String, _ScoreSummary> bSummaries = _summarize(_bResults);
+
+    // Header row; every data row below follows the same column order.
+    final StringBuffer table = StringBuffer()
+      ..write('Score\tAverage A (noise)\tAverage B (noise)\tSpeed-up\n');
+
+    // The set literal keeps insertion order: A's keys, then B-only keys.
+    final Set<String> allKeys = <String>{
+      ...aSummaries.keys,
+      ...bSummaries.keys,
+    };
+    for (final String key in allKeys) {
+      final _ScoreSummary a = aSummaries[key];
+      final _ScoreSummary b = bSummaries[key];
+      final bool hasA = a != null;
+      final bool hasB = b != null;
+
+      table.write('$key\t');
+      table.write(hasA
+        ? '${a.average.toStringAsFixed(2)} (${_ratioToPercent(a.noise)})\t'
+        : '\t');
+      table.write(hasB
+        ? '${b.average.toStringAsFixed(2)} (${_ratioToPercent(b.noise)})\t'
+        : '\t');
+
+      // A speed-up needs both sides; otherwise the last column stays empty.
+      if (hasA && hasB) {
+        final double speedUp = a.average / b.average;
+        table.write('${speedUp.toStringAsFixed(2)}x\t');
+      }
+
+      table.writeln();
+    }
+
+    return table.toString();
+  }
+}
+
+/// The aggregate statistics of one score key across all recorded runs.
+class _ScoreSummary {
+  _ScoreSummary({@required this.average, @required this.noise});
+
+  /// Arithmetic mean of the values a benchmark reported for the score.
+  final double average;
+
+  /// Relative spread of the reported values: their standard deviation
+  /// divided by [average].
+  ///
+  /// Rendered as a percentage by [ABTest.printSummary].
+  final double noise;
+}
+
+/// Appends each score that [result] reports to its per-key list in [results].
+void _addResult(Map<String, dynamic> result, Map<String, List<double>> results) {
+  final Map<String, dynamic> scores = result['data'] as Map<String, dynamic>;
+  for (final String key in (result['benchmarkScoreKeys'] as List<dynamic>).cast<String>()) {
+    final double score = (scores[key] as num).toDouble();
+    results.putIfAbsent(key, () => <double>[]).add(score);
+  }
+}
+
+/// Condenses the values recorded for each score key into a [_ScoreSummary].
+Map<String, _ScoreSummary> _summarize(Map<String, List<double>> results) {
+  final Map<String, _ScoreSummary> summaries = <String, _ScoreSummary>{};
+  for (final MapEntry<String, List<double>> entry in results.entries) {
+    final double mean = _computeAverage(entry.value);
+    // A mean that is not positive is reported as noise-free (a zero mean is
+    // a perfect score); the guard also keeps the division away from zero.
+    final double noise = mean > 0 ? _computeStandardDeviationForPopulation(entry.value) / mean : 0.0;
+    summaries[entry.key] = _ScoreSummary(average: mean, noise: noise);
+  }
+  return summaries;
+}
+
+/// Returns the arithmetic mean of [values], which must not be empty.
+double _computeAverage(Iterable<double> values) {
+  final double total = values.reduce((double sum, double value) => sum + value);
+  return total / values.length;
+}
+
+/// Computes the population standard deviation of [population].
+///
+/// Divides by N rather than by N - 1 as sample standard deviation would.
+///
+/// See also:
+///
+/// * https://en.wikipedia.org/wiki/Standard_deviation
+double _computeStandardDeviationForPopulation(Iterable<double> population) {
+  final double mean = _computeAverage(population);
+  double sumOfSquaredDeltas = 0.0;
+  for (final double value in population) {
+    sumOfSquaredDeltas += math.pow(value - mean, 2);
+  }
+  return math.sqrt(sumOfSquaredDeltas / population.length);
+}
+
+/// Formats the ratio [value] as a percentage with two fraction digits, so
+/// `0.5` becomes `50.00%`.
+String _ratioToPercent(double value) => '${(value * 100).toStringAsFixed(2)}%';