lib/web_ui/lib/src/engine/text/unicode_range.dart - mirrors/engine - Git at Google

 // Copyright 2013 The Flutter Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 const int kChar_0 = 48;
 const int kChar_9 = kChar_0 + 9;
 const int kChar_A = 65;
 const int kChar_Z = 90;
 const int kChar_a = 97;
 const int kChar_z = 122;
 const int kCharBang = 33;
 const int kMashriqi_0 = 0x660;
 const int kMashriqi_9 = kMashriqi_0 + 9;

 enum _ComparisonResult {
   inside,
   higher,
   lower,
 }

 /// Each instance of [UnicodeRange] represents a range of unicode characters
 /// that are assigned a [CharProperty]. For example, the following snippet:
 ///
 /// ```dart
 /// UnicodeRange(0x0041, 0x005A, CharProperty.ALetter);
 /// ```
 ///
 /// is saying that all characters between 0x0041 ("A") and 0x005A ("Z") are
 /// assigned the property [CharProperty.ALetter].
 ///
 /// Note that the Unicode spec uses inclusive ranges and we are doing the
 /// same here.
 class UnicodeRange<P> {
   const UnicodeRange(this.start, this.end, this.property);

   final int start;

   final int end;

   final P property;

   /// Compare a [value] to this range.
   ///
   /// The return value is either:
   /// - lower: The value is lower than the range.
   /// - higher: The value is higher than the range
   /// - inside: The value is within the range.
   _ComparisonResult compare(int value) {
     if (value < start) {
       return _ComparisonResult.lower;
     }
     if (value > end) {
       return _ComparisonResult.higher;
     }
     return _ComparisonResult.inside;
   }
 }

 /// Checks whether the given char code is a UTF-16 surrogate.
 ///
 /// See:
 /// - http://www.unicode.org/faq//utf_bom.html#utf16-2
 bool isUtf16Surrogate(int char) {
   return char & 0xF800 == 0xD800;
 }

 /// Combines a pair of UTF-16 surrogate into a single character code point.
 ///
 /// The surrogate pair is expected to start at [index] in the [text].
 ///
 /// See:
 /// - http://www.unicode.org/faq//utf_bom.html#utf16-3
 int combineSurrogatePair(String text, int index) {
   final int hi = text.codeUnitAt(index);
   final int lo = text.codeUnitAt(index + 1);

   final int x = (hi & ((1 << 6) - 1)) << 10 | lo & ((1 << 10) - 1);
   final int w = (hi >> 6) & ((1 << 5) - 1);
   final int u = w + 1;
   return u << 16 | x;
 }

 /// Returns the code point from [text] at [index] and handles surrogate pairs
 /// for cases that involve two UTF-16 codes.
 int? getCodePoint(String text, int index) {
   if (index < 0 || index >= text.length) {
     return null;
   }

   final int char = text.codeUnitAt(index);
   if (isUtf16Surrogate(char) && index < text.length - 1) {
     return combineSurrogatePair(text, index);
   }
   return char;
 }

 /// Given a list of [UnicodeRange]s, this class performs efficient lookup
 /// to find which range a value falls into.
 ///
 /// The lookup algorithm expects the ranges to have the following constraints:
 /// - Be sorted.
 /// - No overlap between the ranges.
 /// - Gaps between ranges are ok.
 ///
 /// This is used in the context of unicode to find out what property a letter
 /// has. The properties are then used to decide word boundaries, line break
 /// opportunities, etc.
 class UnicodePropertyLookup<P> {
   UnicodePropertyLookup(this.ranges, this.defaultProperty);

   /// Creates a [UnicodePropertyLookup] from packed line break data.
   factory UnicodePropertyLookup.fromPackedData(
     String packedData,
     int singleRangesCount,
     List<P> propertyEnumValues,
     P defaultProperty,
   ) {
     return UnicodePropertyLookup<P>(
       _unpackProperties<P>(packedData, singleRangesCount, propertyEnumValues),
       defaultProperty,
     );
   }

   /// The list of unicode ranges and their associated properties.
   final List<UnicodeRange<P>> ranges;

   /// The default property to use when a character doesn't belong in any
   /// known range.
   final P defaultProperty;

   /// Cache for lookup results.
   final Map<int, P> _cache = <int, P>{};

   /// Take a [text] and an [index], and returns the property of the character
   /// located at that [index].
   ///
   /// If the [index] is out of range, null will be returned.
   P find(String text, int index) {
     final int? codePoint = getCodePoint(text, index);
     return codePoint == null ? defaultProperty : findForChar(codePoint);
   }

   /// Takes one character as an integer code unit and returns its property.
   ///
   /// If a property can't be found for the given character, then the default
   /// property will be returned.
   P findForChar(int? char) {
     if (char == null) {
       return defaultProperty;
     }

     final P? cacheHit = _cache[char];
     if (cacheHit != null) {
       return cacheHit;
     }

     final int rangeIndex = _binarySearch(char);
     final P result = rangeIndex == -1 ? defaultProperty : ranges[rangeIndex].property;
     // Cache the result.
     _cache[char] = result;
     return result;
   }

   int _binarySearch(int value) {
     int min = 0;
     int max = ranges.length;
     while (min < max) {
       final int mid = min + ((max - min) >> 1);
       final UnicodeRange<P> range = ranges[mid];
       switch (range.compare(value)) {
         case _ComparisonResult.higher:
           min = mid + 1;
           break;
         case _ComparisonResult.lower:
           max = mid;
           break;
         case _ComparisonResult.inside:
           return mid;
       }
     }
     return -1;
   }
 }

 List<UnicodeRange<P>> _unpackProperties<P>(
   String packedData,
   int singleRangesCount,
   List<P> propertyEnumValues,
 ) {
   // Packed data is mostly structured in chunks of 9 characters each:
   //
   // * [0..3]: Range start, encoded as a base36 integer.
   // * [4..7]: Range end, encoded as a base36 integer.
   // * [8]: Index of the property enum value, encoded as a single letter.
   //
   // When the range is a single number (i.e. range start == range end), it gets
   // packed more efficiently in a chunk of 6 characters:
   //
   // * [0..3]: Range start (and range end), encoded as a base 36 integer.
   // * [4]: "!" to indicate that there's no range end.
   // * [5]: Index of the property enum value, encoded as a single letter.

   // `packedData.length + singleRangesCount * 3` would have been the size of the
   // packed data if the efficient packing of single-range items wasn't applied.
   assert((packedData.length + singleRangesCount * 3) % 9 == 0);

   final List<UnicodeRange<P>> ranges = <UnicodeRange<P>>[];
   final int dataLength = packedData.length;
   int i = 0;
   while (i < dataLength) {
     final int rangeStart = _consumeInt(packedData, i);
     i += 4;

     int rangeEnd;
     if (packedData.codeUnitAt(i) == kCharBang) {
       rangeEnd = rangeStart;
       i++;
     } else {
       rangeEnd = _consumeInt(packedData, i);
       i += 4;
     }
     final int charCode = packedData.codeUnitAt(i);
     final P property =
         propertyEnumValues[_getEnumIndexFromPackedValue(charCode)];
     i++;

     ranges.add(UnicodeRange<P>(rangeStart, rangeEnd, property));
   }
   return ranges;
 }

 int _getEnumIndexFromPackedValue(int charCode) {
   // This has to stay in sync with [EnumValue.serialized] in
   // `tool/unicode_sync_script.dart`.

   assert((charCode >= kChar_A && charCode <= kChar_Z) ||
       (charCode >= kChar_a && charCode <= kChar_z));

   // Uppercase letters were assigned to the first 26 enum values.
   if (charCode <= kChar_Z) {
     return charCode - kChar_A;
   }
   // Lowercase letters were assigned to enum values above 26.
   return 26 + charCode - kChar_a;
 }

 int _consumeInt(String packedData, int index) {
   // The implementation is equivalent to:
   //
   // ```dart
   // return int.tryParse(packedData.substring(index, index + 4), radix: 36);
   // ```
   //
   // But using substring is slow when called too many times. This custom
   // implementation makes the unpacking 25%-45% faster than using substring.
   final int digit0 = _getIntFromCharCode(packedData.codeUnitAt(index + 3));
   final int digit1 = _getIntFromCharCode(packedData.codeUnitAt(index + 2));
   final int digit2 = _getIntFromCharCode(packedData.codeUnitAt(index + 1));
   final int digit3 = _getIntFromCharCode(packedData.codeUnitAt(index));
   return digit0 + (digit1 * 36) + (digit2 * 36 * 36) + (digit3 * 36 * 36 * 36);
 }

 /// Does the same thing as [int.parse(str, 36)] but takes only a single
 /// character as a [charCode] integer.
 int _getIntFromCharCode(int charCode) {
   assert((charCode >= kChar_0 && charCode <= kChar_9) ||
       (charCode >= kChar_a && charCode <= kChar_z));

   if (charCode <= kChar_9) {
     return charCode - kChar_0;
   }
   // "a" starts from 10 and remaining letters go up from there.
   return charCode - kChar_a + 10;
 }
	// Copyright 2013 The Flutter Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	const int kChar_0 = 48;
	const int kChar_9 = kChar_0 + 9;
	const int kChar_A = 65;
	const int kChar_Z = 90;
	const int kChar_a = 97;
	const int kChar_z = 122;
	const int kCharBang = 33;
	const int kMashriqi_0 = 0x660;
	const int kMashriqi_9 = kMashriqi_0 + 9;

	enum _ComparisonResult {
	inside,
	higher,
	lower,
	}

	/// Each instance of [UnicodeRange] represents a range of unicode characters
	/// that are assigned a [CharProperty]. For example, the following snippet:
	///
	/// ```dart
	/// UnicodeRange(0x0041, 0x005A, CharProperty.ALetter);
	/// ```
	///
	/// is saying that all characters between 0x0041 ("A") and 0x005A ("Z") are
	/// assigned the property [CharProperty.ALetter].
	///
	/// Note that the Unicode spec uses inclusive ranges and we are doing the
	/// same here.
	class UnicodeRange<P> {
	const UnicodeRange(this.start, this.end, this.property);

	final int start;

	final int end;

	final P property;

	/// Compare a [value] to this range.
	///
	/// The return value is either:
	/// - lower: The value is lower than the range.
	/// - higher: The value is higher than the range
	/// - inside: The value is within the range.
	_ComparisonResult compare(int value) {
	if (value < start) {
	return _ComparisonResult.lower;
	}
	if (value > end) {
	return _ComparisonResult.higher;
	}
	return _ComparisonResult.inside;
	}
	}

	/// Checks whether the given char code is a UTF-16 surrogate.
	///
	/// See:
	/// - http://www.unicode.org/faq//utf_bom.html#utf16-2
	bool isUtf16Surrogate(int char) {
	return char & 0xF800 == 0xD800;
	}

	/// Combines a pair of UTF-16 surrogate into a single character code point.
	///
	/// The surrogate pair is expected to start at [index] in the [text].
	///
	/// See:
	/// - http://www.unicode.org/faq//utf_bom.html#utf16-3
	int combineSurrogatePair(String text, int index) {
	final int hi = text.codeUnitAt(index);
	final int lo = text.codeUnitAt(index + 1);

	final int x = (hi & ((1 << 6) - 1)) << 10 \| lo & ((1 << 10) - 1);
	final int w = (hi >> 6) & ((1 << 5) - 1);
	final int u = w + 1;
	return u << 16 \| x;
	}

	/// Returns the code point from [text] at [index] and handles surrogate pairs
	/// for cases that involve two UTF-16 codes.
	int? getCodePoint(String text, int index) {
	if (index < 0 \|\| index >= text.length) {
	return null;
	}

	final int char = text.codeUnitAt(index);
	if (isUtf16Surrogate(char) && index < text.length - 1) {
	return combineSurrogatePair(text, index);
	}
	return char;
	}

	/// Given a list of [UnicodeRange]s, this class performs efficient lookup
	/// to find which range a value falls into.
	///
	/// The lookup algorithm expects the ranges to have the following constraints:
	/// - Be sorted.
	/// - No overlap between the ranges.
	/// - Gaps between ranges are ok.
	///
	/// This is used in the context of unicode to find out what property a letter
	/// has. The properties are then used to decide word boundaries, line break
	/// opportunities, etc.
	class UnicodePropertyLookup<P> {
	UnicodePropertyLookup(this.ranges, this.defaultProperty);

	/// Creates a [UnicodePropertyLookup] from packed line break data.
	factory UnicodePropertyLookup.fromPackedData(
	String packedData,
	int singleRangesCount,
	List<P> propertyEnumValues,
	P defaultProperty,
	) {
	return UnicodePropertyLookup<P>(
	_unpackProperties<P>(packedData, singleRangesCount, propertyEnumValues),
	defaultProperty,
	);
	}

	/// The list of unicode ranges and their associated properties.
	final List<UnicodeRange<P>> ranges;

	/// The default property to use when a character doesn't belong in any
	/// known range.
	final P defaultProperty;

	/// Cache for lookup results.
	final Map<int, P> _cache = <int, P>{};

	/// Take a [text] and an [index], and returns the property of the character
	/// located at that [index].
	///
	/// If the [index] is out of range, null will be returned.
	P find(String text, int index) {
	final int? codePoint = getCodePoint(text, index);
	return codePoint == null ? defaultProperty : findForChar(codePoint);
	}

	/// Takes one character as an integer code unit and returns its property.
	///
	/// If a property can't be found for the given character, then the default
	/// property will be returned.
	P findForChar(int? char) {
	if (char == null) {
	return defaultProperty;
	}

	final P? cacheHit = _cache[char];
	if (cacheHit != null) {
	return cacheHit;
	}

	final int rangeIndex = _binarySearch(char);
	final P result = rangeIndex == -1 ? defaultProperty : ranges[rangeIndex].property;
	// Cache the result.
	_cache[char] = result;
	return result;
	}

	int _binarySearch(int value) {
	int min = 0;
	int max = ranges.length;
	while (min < max) {
	final int mid = min + ((max - min) >> 1);
	final UnicodeRange<P> range = ranges[mid];
	switch (range.compare(value)) {
	case _ComparisonResult.higher:
	min = mid + 1;
	break;
	case _ComparisonResult.lower:
	max = mid;
	break;
	case _ComparisonResult.inside:
	return mid;
	}
	}
	return -1;
	}
	}

	List<UnicodeRange<P>> _unpackProperties<P>(
	String packedData,
	int singleRangesCount,
	List<P> propertyEnumValues,
	) {
	// Packed data is mostly structured in chunks of 9 characters each:
	//
	// * [0..3]: Range start, encoded as a base36 integer.
	// * [4..7]: Range end, encoded as a base36 integer.
	// * [8]: Index of the property enum value, encoded as a single letter.
	//
	// When the range is a single number (i.e. range start == range end), it gets
	// packed more efficiently in a chunk of 6 characters:
	//
	// * [0..3]: Range start (and range end), encoded as a base 36 integer.
	// * [4]: "!" to indicate that there's no range end.
	// * [5]: Index of the property enum value, encoded as a single letter.

	// `packedData.length + singleRangesCount * 3` would have been the size of the
	// packed data if the efficient packing of single-range items wasn't applied.
	assert((packedData.length + singleRangesCount * 3) % 9 == 0);

	final List<UnicodeRange<P>> ranges = <UnicodeRange<P>>[];
	final int dataLength = packedData.length;
	int i = 0;
	while (i < dataLength) {
	final int rangeStart = _consumeInt(packedData, i);
	i += 4;

	int rangeEnd;
	if (packedData.codeUnitAt(i) == kCharBang) {
	rangeEnd = rangeStart;
	i++;
	} else {
	rangeEnd = _consumeInt(packedData, i);
	i += 4;
	}
	final int charCode = packedData.codeUnitAt(i);
	final P property =
	propertyEnumValues[_getEnumIndexFromPackedValue(charCode)];
	i++;

	ranges.add(UnicodeRange<P>(rangeStart, rangeEnd, property));
	}
	return ranges;
	}

	int _getEnumIndexFromPackedValue(int charCode) {
	// This has to stay in sync with [EnumValue.serialized] in
	// `tool/unicode_sync_script.dart`.

	assert((charCode >= kChar_A && charCode <= kChar_Z) \|\|
	(charCode >= kChar_a && charCode <= kChar_z));

	// Uppercase letters were assigned to the first 26 enum values.
	if (charCode <= kChar_Z) {
	return charCode - kChar_A;
	}
	// Lowercase letters were assigned to enum values above 26.
	return 26 + charCode - kChar_a;
	}

	int _consumeInt(String packedData, int index) {
	// The implementation is equivalent to:
	//
	// ```dart
	// return int.tryParse(packedData.substring(index, index + 4), radix: 36);
	// ```
	//
	// But using substring is slow when called too many times. This custom
	// implementation makes the unpacking 25%-45% faster than using substring.
	final int digit0 = _getIntFromCharCode(packedData.codeUnitAt(index + 3));
	final int digit1 = _getIntFromCharCode(packedData.codeUnitAt(index + 2));
	final int digit2 = _getIntFromCharCode(packedData.codeUnitAt(index + 1));
	final int digit3 = _getIntFromCharCode(packedData.codeUnitAt(index));
	return digit0 + (digit1 * 36) + (digit2 * 36 * 36) + (digit3 * 36 * 36 * 36);
	}

	/// Does the same thing as [int.parse(str, 36)] but takes only a single
	/// character as a [charCode] integer.
	int _getIntFromCharCode(int charCode) {
	assert((charCode >= kChar_0 && charCode <= kChar_9) \|\|
	(charCode >= kChar_a && charCode <= kChar_z));

	if (charCode <= kChar_9) {
	return charCode - kChar_0;
	}
	// "a" starts from 10 and remaining letters go up from there.
	return charCode - kChar_a + 10;
	}