blob: 0e280cc6ae8bcb08a6d8b61a5c2384667159d00c [file] [log] [blame]
// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
const int kChar_0 = 48;
const int kChar_9 = kChar_0 + 9;
const int kChar_A = 65;
const int kChar_Z = 90;
const int kChar_a = 97;
const int kChar_z = 122;
const int kCharBang = 33;
const int kMashriqi_0 = 0x660;
const int kMashriqi_9 = kMashriqi_0 + 9;
enum _ComparisonResult {
inside,
higher,
lower,
}
/// Each instance of [UnicodeRange] represents a range of unicode characters
/// that are assigned a [CharProperty]. For example, the following snippet:
///
/// ```dart
/// UnicodeRange(0x0041, 0x005A, CharProperty.ALetter);
/// ```
///
/// is saying that all characters between 0x0041 ("A") and 0x005A ("Z") are
/// assigned the property [CharProperty.ALetter].
///
/// Note that the Unicode spec uses inclusive ranges and we are doing the
/// same here.
class UnicodeRange<P> {
const UnicodeRange(this.start, this.end, this.property);
final int start;
final int end;
final P property;
/// Compare a [value] to this range.
///
/// The return value is either:
/// - lower: The value is lower than the range.
/// - higher: The value is higher than the range
/// - inside: The value is within the range.
_ComparisonResult compare(int value) {
if (value < start) {
return _ComparisonResult.lower;
}
if (value > end) {
return _ComparisonResult.higher;
}
return _ComparisonResult.inside;
}
}
/// Checks whether the given char code is a UTF-16 surrogate.
///
/// See:
/// - http://www.unicode.org/faq//utf_bom.html#utf16-2
bool isUtf16Surrogate(int char) {
return char & 0xF800 == 0xD800;
}
/// Combines a pair of UTF-16 surrogate into a single character code point.
///
/// The surrogate pair is expected to start at [index] in the [text].
///
/// See:
/// - http://www.unicode.org/faq//utf_bom.html#utf16-3
int combineSurrogatePair(String text, int index) {
final int hi = text.codeUnitAt(index);
final int lo = text.codeUnitAt(index + 1);
final int x = (hi & ((1 << 6) - 1)) << 10 | lo & ((1 << 10) - 1);
final int w = (hi >> 6) & ((1 << 5) - 1);
final int u = w + 1;
return u << 16 | x;
}
/// Returns the code point from [text] at [index] and handles surrogate pairs
/// for cases that involve two UTF-16 codes.
int? getCodePoint(String text, int index) {
if (index < 0 || index >= text.length) {
return null;
}
final int char = text.codeUnitAt(index);
if (isUtf16Surrogate(char) && index < text.length - 1) {
return combineSurrogatePair(text, index);
}
return char;
}
/// Given a list of [UnicodeRange]s, this class performs efficient lookup
/// to find which range a value falls into.
///
/// The lookup algorithm expects the ranges to have the following constraints:
/// - Be sorted.
/// - No overlap between the ranges.
/// - Gaps between ranges are ok.
///
/// This is used in the context of unicode to find out what property a letter
/// has. The properties are then used to decide word boundaries, line break
/// opportunities, etc.
class UnicodePropertyLookup<P> {
UnicodePropertyLookup(this.ranges, this.defaultProperty);
/// Creates a [UnicodePropertyLookup] from packed line break data.
factory UnicodePropertyLookup.fromPackedData(
String packedData,
int singleRangesCount,
List<P> propertyEnumValues,
P defaultProperty,
) {
return UnicodePropertyLookup<P>(
_unpackProperties<P>(packedData, singleRangesCount, propertyEnumValues),
defaultProperty,
);
}
/// The list of unicode ranges and their associated properties.
final List<UnicodeRange<P>> ranges;
/// The default property to use when a character doesn't belong in any
/// known range.
final P defaultProperty;
/// Cache for lookup results.
final Map<int, P> _cache = <int, P>{};
/// Take a [text] and an [index], and returns the property of the character
/// located at that [index].
///
/// If the [index] is out of range, null will be returned.
P find(String text, int index) {
final int? codePoint = getCodePoint(text, index);
return codePoint == null ? defaultProperty : findForChar(codePoint);
}
/// Takes one character as an integer code unit and returns its property.
///
/// If a property can't be found for the given character, then the default
/// property will be returned.
P findForChar(int? char) {
if (char == null) {
return defaultProperty;
}
final P? cacheHit = _cache[char];
if (cacheHit != null) {
return cacheHit;
}
final int rangeIndex = _binarySearch(char);
final P result = rangeIndex == -1 ? defaultProperty : ranges[rangeIndex].property;
// Cache the result.
_cache[char] = result;
return result;
}
int _binarySearch(int value) {
int min = 0;
int max = ranges.length;
while (min < max) {
final int mid = min + ((max - min) >> 1);
final UnicodeRange<P> range = ranges[mid];
switch (range.compare(value)) {
case _ComparisonResult.higher:
min = mid + 1;
break;
case _ComparisonResult.lower:
max = mid;
break;
case _ComparisonResult.inside:
return mid;
}
}
return -1;
}
}
List<UnicodeRange<P>> _unpackProperties<P>(
String packedData,
int singleRangesCount,
List<P> propertyEnumValues,
) {
// Packed data is mostly structured in chunks of 9 characters each:
//
// * [0..3]: Range start, encoded as a base36 integer.
// * [4..7]: Range end, encoded as a base36 integer.
// * [8]: Index of the property enum value, encoded as a single letter.
//
// When the range is a single number (i.e. range start == range end), it gets
// packed more efficiently in a chunk of 6 characters:
//
// * [0..3]: Range start (and range end), encoded as a base 36 integer.
// * [4]: "!" to indicate that there's no range end.
// * [5]: Index of the property enum value, encoded as a single letter.
// `packedData.length + singleRangesCount * 3` would have been the size of the
// packed data if the efficient packing of single-range items wasn't applied.
assert((packedData.length + singleRangesCount * 3) % 9 == 0);
final List<UnicodeRange<P>> ranges = <UnicodeRange<P>>[];
final int dataLength = packedData.length;
int i = 0;
while (i < dataLength) {
final int rangeStart = _consumeInt(packedData, i);
i += 4;
int rangeEnd;
if (packedData.codeUnitAt(i) == kCharBang) {
rangeEnd = rangeStart;
i++;
} else {
rangeEnd = _consumeInt(packedData, i);
i += 4;
}
final int charCode = packedData.codeUnitAt(i);
final P property =
propertyEnumValues[_getEnumIndexFromPackedValue(charCode)];
i++;
ranges.add(UnicodeRange<P>(rangeStart, rangeEnd, property));
}
return ranges;
}
int _getEnumIndexFromPackedValue(int charCode) {
// This has to stay in sync with [EnumValue.serialized] in
// `tool/unicode_sync_script.dart`.
assert((charCode >= kChar_A && charCode <= kChar_Z) ||
(charCode >= kChar_a && charCode <= kChar_z));
// Uppercase letters were assigned to the first 26 enum values.
if (charCode <= kChar_Z) {
return charCode - kChar_A;
}
// Lowercase letters were assigned to enum values above 26.
return 26 + charCode - kChar_a;
}
int _consumeInt(String packedData, int index) {
// The implementation is equivalent to:
//
// ```dart
// return int.tryParse(packedData.substring(index, index + 4), radix: 36);
// ```
//
// But using substring is slow when called too many times. This custom
// implementation makes the unpacking 25%-45% faster than using substring.
final int digit0 = _getIntFromCharCode(packedData.codeUnitAt(index + 3));
final int digit1 = _getIntFromCharCode(packedData.codeUnitAt(index + 2));
final int digit2 = _getIntFromCharCode(packedData.codeUnitAt(index + 1));
final int digit3 = _getIntFromCharCode(packedData.codeUnitAt(index));
return digit0 + (digit1 * 36) + (digit2 * 36 * 36) + (digit3 * 36 * 36 * 36);
}
/// Does the same thing as [int.parse(str, 36)] but takes only a single
/// character as a [charCode] integer.
int _getIntFromCharCode(int charCode) {
assert((charCode >= kChar_0 && charCode <= kChar_9) ||
(charCode >= kChar_a && charCode <= kChar_z));
if (charCode <= kChar_9) {
return charCode - kChar_0;
}
// "a" starts from 10 and remaining letters go up from there.
return charCode - kChar_a + 10;
}