third_party/txt/src/minikin/GraphemeBreak.cpp - mirrors/engine - Git at Google

 /*
  * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <stdint.h>
 #include <unicode/uchar.h>
 #include <unicode/utf16.h>
 #include <algorithm>

 #include <minikin/Emoji.h>
 #include <minikin/GraphemeBreak.h>
 #include "MinikinInternal.h"
 #include "utils/WindowsUtils.h"

 namespace minikin {

 // Returns the Grapheme_Cluster_Break value used by this file for code point
 // c: the property value reported by u_getIntPropertyValue(), except for a
 // small set of manually curated overrides.
 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
   // THAI CHARACTER SARA AM is treated as a normal letter by most other
   // implementations: they allow a grapheme break before it.
   if (c == 0x0E33) {
     return U_GCB_OTHER;
   }

   // Characters defined as Control that we want to treat as Extend. These are
   // curated manually. Single code points first...
   switch (c) {
     case 0x00AD:  // SHY
     case 0x061C:  // ALM
     case 0x180E:  // MONGOLIAN VOWEL SEPARATOR
     case 0x200B:  // ZWSP
     case 0x200E:  // LRM
     case 0x200F:  // RLM
     case 0xFEFF:  // BOM
       return U_GCB_EXTEND;
     default:
       break;
   }

   // ...then the contiguous ranges.
   // LRE, RLE, PDF, LRO, RLO
   const bool bidi_embedding_control = (c >= 0x202A && c <= 0x202E);
   // WJ, invisible math operators, LRI, RLI, FSI, PDI, and the deprecated
   // invisible format controls
   const bool invisible_format_control = (c >= 0x2060 && c <= 0x206F);
   // recently undeprecated tag characters in Plane 14
   const bool plane14_tag = (c >= 0xE0000 && c <= 0xE007F);
   if (bidi_embedding_control || invisible_format_control || plane14_tag) {
     return U_GCB_EXTEND;
   }

   // No tailoring applies: use the property value as is.
   return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
 }

 // Reports whether c is one of the characters whose IndicSyllabicCategory is
 // Pure_Killer. The list is taken from
 // http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
 bool isPureKiller(uint32_t c) {
   switch (c) {
     case 0x0E3A:
     case 0x0E4E:
     case 0x0F84:
     case 0x103A:
     case 0x1714:
     case 0x1734:
     case 0x17D1:
     case 0x1BAA:
     case 0x1BF2:
     case 0x1BF3:
     case 0xA806:
     case 0xA953:
     case 0xABED:
     case 0x11134:
     case 0x112EA:
     case 0x1172B:
       return true;
     default:
       return false;
   }
 }

 // Decides whether a grapheme cluster break is allowed at `offset` inside the
 // UTF-16 text buf[start, start + count).
 //
 // advances: optional advances, read at index (offset - start); may be
 //           nullptr, in which case the font-dependent rules below assume
 //           that the character at `offset` has no advance.
 // buf:      UTF-16 code units.
 // start:    index in buf of the first code unit of the run.
 // count:    number of code units in the run.
 // offset:   index in buf of the candidate break position.
 //
 // Returns true if a break is allowed at `offset`, false otherwise. Offsets at
 // or outside either end of the run always report a break.
 bool GraphemeBreak::isGraphemeBreak(const float* advances,
                                     const uint16_t* buf,
                                     size_t start,
                                     size_t count,
                                     const size_t offset) {
   // This implementation closely follows Unicode Standard Annex #29 on
   // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
   // implementing a tailored version of extended grapheme clusters.
   // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.

   // Rule GB1, sot ÷; Rule GB2, ÷ eot
   if (offset <= start || offset >= start + count) {
     return true;
   }
   if (U16_IS_TRAIL(buf[offset])) {
     // Don't break inside a surrogate pair, but a lonely trailing surrogate
     // (one that is not preceded by a lead surrogate) is a break.
     return !U16_IS_LEAD(buf[offset - 1]);
   }
   // c1 is the code point that ends at `offset`, c2 the one that starts there.
   // The macros also move offset_back to the start of c1 and offset_forward
   // past the end of c2.
   uint32_t c1 = 0;
   uint32_t c2 = 0;
   size_t offset_back = offset;
   size_t offset_forward = offset;
   U16_PREV(buf, start, offset_back, c1);
   U16_NEXT(buf, offset_forward, start + count, c2);
   int32_t p1 = tailoredGraphemeClusterBreak(c1);
   int32_t p2 = tailoredGraphemeClusterBreak(c2);
   // Rule GB3, CR x LF
   if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
     return false;
   }
   // Rule GB4, (Control | CR | LF) ÷
   if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
     return true;
   }
   // Rule GB5, ÷ (Control | CR | LF)
   if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
     return true;
   }
   // Rule GB6, L x ( L | V | LV | LVT )
   if (p1 == U_GCB_L &&
       (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
     return false;
   }
   // Rule GB7, ( LV | V ) x ( V | T )
   if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
     return false;
   }
   // Rule GB8, ( LVT | T ) x T
   if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
     return false;
   }
   // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
   if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK ||
       p1 == U_GCB_PREPEND) {
     return false;
   }

   // This is used to decide font-dependent grapheme clusters. If we don't have
   // the advance information, we become conservative in grapheme breaking and
   // assume that it has no advance.
   const bool c2_has_advance =
       (advances != nullptr && advances[offset - start] != 0.0);

   // All the following rules are font-dependent, in the way that if we know c2
   // has an advance, we definitely know that it cannot form a grapheme with the
   // character(s) before it. So we make the decision in favor of a grapheme
   // break early.
   if (c2_has_advance) {
     return true;
   }

   // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking
   // properties for determining emoji-ness and carry our own data, because our
   // data could be more fresh than what ICU provides.
   //
   // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier.
   // The rule itself says do not break between emoji base and emoji modifiers,
   // skipping all Extend characters. Variation selectors are considered Extend,
   // so they are handled fine. Note that the code below steps back over at most
   // one Extend character before looking for the emoji base.
   //
   // We tailor this by requiring that an actual ligature is formed. If the font
   // doesn't form a ligature, we allow a break before the modifier.
   if (isEmojiModifier(c2)) {
     uint32_t c0 = c1;
     size_t offset_backback = offset_back;
     int32_t p0 = p1;
     if (p0 == U_GCB_EXTEND && offset_backback > start) {
       // skip over emoji variation selector
       U16_PREV(buf, start, offset_backback, c0);
     }
     if (isEmojiBase(c0)) {
       return false;
     }
   }

   // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG)
   // We try to make emoji sequences with ZWJ a single grapheme cluster, but only
   // if they actually merge to one cluster. So we are more relaxed than the UAX
   // #29 rules in accepting any emoji character after the ZWJ, but are tighter
   // in that we only treat it as one cluster if a ligature is actually formed
   // and we also require the character before the ZWJ to also be an emoji.
   if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
     // look at character before ZWJ to see that both can participate in an
     // emoji zwj sequence
     uint32_t c0 = 0;
     size_t offset_backback = offset_back;
     U16_PREV(buf, start, offset_backback, c0);
     if (c0 == 0xFE0F && offset_backback > start) {
       // skip over emoji variation selector
       U16_PREV(buf, start, offset_backback, c0);
     }
     if (isEmoji(c0)) {
       return false;
     }
   }

   // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
   // sot   (RI RI)*  RI x RI
   // [^RI] (RI RI)*  RI x RI
   //
   // If we have font information, we have already returned a break above when
   // the second character had an advance; reaching this point means it has no
   // advance, i.e. a ligature was formed. If we don't have font information, we
   // look back like UAX #29 recommends, but only up to 1000 code units.
   if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
     if (advances != nullptr) {
       // We have advances information. But if we are here, we already know c2
       // has no advance. So we should definitely disallow a break.
       return false;
     } else {
       // Look at up to 1000 code units.
       const size_t lookback_barrier =
           std::max((ssize_t)start, (ssize_t)offset_back - 1000);
       size_t offset_backback = offset_back;
       // Walk backwards over the run of regional indicators that precedes c1,
       // stopping at the barrier or at the first non-RI code point.
       while (offset_backback > lookback_barrier) {
         uint32_t c0 = 0;
         U16_PREV(buf, lookback_barrier, offset_backback, c0);
         if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
           // Step forward again so that offset_backback points just after the
           // non-RI code point, i.e. at the start of the RI run.
           offset_backback += U16_LENGTH(c0);
           break;
         }
       }
       // The number 4 comes from the number of code units in a whole flag.
       return (offset - offset_backback) % 4 == 0;
     }
   }
   // Cluster Indic syllables together (tailoring of UAX #29).
   // Immediately after each virama (that is not just a pure killer) followed by
   // a letter, we disallow grapheme breaks (if we are here, we don't know about
   // advances, or we already know that c2 has no advance).
   if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
       && !isPureKiller(c1) &&
       u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
     return false;
   }
   // Rule GB999, Any ÷ Any
   return true;
 }

 // Moves `offset` to a grapheme cluster boundary of buf[start, start + count)
 // according to `opt` and returns the resulting offset.
 //
 // AFTER / BEFORE first step one code unit forwards / backwards (unless
 // already at the end / start of the run) and then behave like AT_OR_AFTER /
 // AT_OR_BEFORE, which keep moving in the same direction until
 // isGraphemeBreak() reports a break. AT returns `offset` unchanged if it is a
 // break and (size_t)-1 otherwise.
 size_t GraphemeBreak::getTextRunCursor(const float* advances,
                                        const uint16_t* buf,
                                        size_t start,
                                        size_t count,
                                        size_t offset,
                                        MoveOpt opt) {
   if (opt == AT) {
     const bool at_break = isGraphemeBreak(advances, buf, start, count, offset);
     return at_break ? offset : (size_t)-1;
   }

   if (opt == AFTER || opt == AT_OR_AFTER) {
     if (opt == AFTER && offset < start + count) {
       ++offset;
     }
     // Scan forwards to the next boundary.
     while (!isGraphemeBreak(advances, buf, start, count, offset)) {
       ++offset;
     }
     return offset;
   }

   if (opt == BEFORE || opt == AT_OR_BEFORE) {
     if (opt == BEFORE && offset > start) {
       --offset;
     }
     // Scan backwards to the previous boundary.
     while (!isGraphemeBreak(advances, buf, start, count, offset)) {
       --offset;
     }
     return offset;
   }

   // Any other option value leaves the offset untouched.
   return offset;
 }

 }  // namespace minikin
	/*
	* Copyright (C) 2014 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <stdint.h>
	#include <unicode/uchar.h>
	#include <unicode/utf16.h>
	#include <algorithm>

	#include <minikin/Emoji.h>
	#include <minikin/GraphemeBreak.h>
	#include "MinikinInternal.h"
	#include "utils/WindowsUtils.h"

	namespace minikin {

	// Returns the Grapheme_Cluster_Break value used by this file for code point
	// c: the property value reported by u_getIntPropertyValue(), except for a
	// small set of manually curated overrides.
	int32_t tailoredGraphemeClusterBreak(uint32_t c) {
	  // Characters defined as Control that we want to treat as Extend.
	  // These are curated manually.
	  if (c == 0x00AD                      // SHY
	      || c == 0x061C                   // ALM
	      || c == 0x180E                   // MONGOLIAN VOWEL SEPARATOR
	      || c == 0x200B                   // ZWSP
	      || c == 0x200E                   // LRM
	      || c == 0x200F                   // RLM
	      || (0x202A <= c && c <= 0x202E)  // LRE, RLE, PDF, LRO, RLO
	      || ((c | 0xF) ==
	          0x206F)     // WJ, invisible math operators, LRI, RLI, FSI, PDI,
	                      // and the deprecated invisible format controls
	      || c == 0xFEFF  // BOM
	      || ((c | 0x7F) ==
	          0xE007F)) {  // recently undeprecated tag characters in Plane 14
	    return U_GCB_EXTEND;
	  }
	  // THAI CHARACTER SARA AM is treated as a normal letter by most other
	  // implementations: they allow a grapheme break before it.
	  if (c == 0x0E33) {
	    return U_GCB_OTHER;
	  }
	  // No tailoring applies: use the property value as is.
	  return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
	}

	// Returns true for all characters whose IndicSyllabicCategory is Pure_Killer.
	// From http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
	bool isPureKiller(uint32_t c) {
	  return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A ||
	          c == 0x1714 || c == 0x1734 || c == 0x17D1 || c == 0x1BAA ||
	          c == 0x1BF2 || c == 0x1BF3 || c == 0xA806 || c == 0xA953 ||
	          c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B);
	}

	bool GraphemeBreak::isGraphemeBreak(const float* advances,
	const uint16_t* buf,
	size_t start,
	size_t count,
	const size_t offset) {
	// This implementation closely follows Unicode Standard Annex #29 on
	// Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
	// implementing a tailored version of extended grapheme clusters.
	// The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.

	// Rule GB1, sot ÷; Rule GB2, ÷ eot
	if (offset <= start \|\| offset >= start + count) {
	return true;
	}
	if (U16_IS_TRAIL(buf[offset])) {
	// Don't break a surrogate pair, but a lonely trailing surrogate pair is a
	// break
	return !U16_IS_LEAD(buf[offset - 1]);
	}
	uint32_t c1 = 0;
	uint32_t c2 = 0;
	size_t offset_back = offset;
	size_t offset_forward = offset;
	U16_PREV(buf, start, offset_back, c1);
	U16_NEXT(buf, offset_forward, start + count, c2);
	int32_t p1 = tailoredGraphemeClusterBreak(c1);
	int32_t p2 = tailoredGraphemeClusterBreak(c2);
	// Rule GB3, CR x LF
	if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
	return false;
	}
	// Rule GB4, (Control \| CR \| LF) ÷
	if (p1 == U_GCB_CONTROL \|\| p1 == U_GCB_CR \|\| p1 == U_GCB_LF) {
	return true;
	}
	// Rule GB5, ÷ (Control \| CR \| LF)
	if (p2 == U_GCB_CONTROL \|\| p2 == U_GCB_CR \|\| p2 == U_GCB_LF) {
	return true;
	}
	// Rule GB6, L x ( L \| V \| LV \| LVT )
	if (p1 == U_GCB_L &&
	(p2 == U_GCB_L \|\| p2 == U_GCB_V \|\| p2 == U_GCB_LV \|\| p2 == U_GCB_LVT)) {
	return false;
	}
	// Rule GB7, ( LV \| V ) x ( V \| T )
	if ((p1 == U_GCB_LV \|\| p1 == U_GCB_V) && (p2 == U_GCB_V \|\| p2 == U_GCB_T)) {
	return false;
	}
	// Rule GB8, ( LVT \| T ) x T
	if ((p1 == U_GCB_LVT \|\| p1 == U_GCB_T) && p2 == U_GCB_T) {
	return false;
	}
	// Rule GB9, x (Extend \| ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
	if (p2 == U_GCB_EXTEND \|\| p2 == U_GCB_ZWJ \|\| p2 == U_GCB_SPACING_MARK \|\|
	p1 == U_GCB_PREPEND) {
	return false;
	}

	// This is used to decide font-dependent grapheme clusters. If we don't have
	// the advance information, we become conservative in grapheme breaking and
	// assume that it has no advance.
	const bool c2_has_advance =
	(advances != nullptr && advances[offset - start] != 0.0);

	// All the following rules are font-dependent, in the way that if we know c2
	// has an advance, we definitely know that it cannot form a grapheme with the
	// character(s) before it. So we make the decision in favor a grapheme break
	// early.
	if (c2_has_advance) {
	return true;
	}

	// Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking
	// properties for determining emoji-ness and carry our own data, because our
	// data could be more fresh than what ICU provides.
	//
	// Tailored version of Rule GB10, (E_Base \| EBG) Extend* × E_Modifier.
	// The rule itself says do not break between emoji base and emoji modifiers,
	// skipping all Extend characters. Variation selectors are considered Extend,
	// so they are handled fine.
	//
	// We tailor this by requiring that an actual ligature is formed. If the font
	// doesn't form a ligature, we allow a break before the modifier.
	if (isEmojiModifier(c2)) {
	uint32_t c0 = c1;
	size_t offset_backback = offset_back;
	int32_t p0 = p1;
	if (p0 == U_GCB_EXTEND && offset_backback > start) {
	// skip over emoji variation selector
	U16_PREV(buf, start, offset_backback, c0);
	}
	if (isEmojiBase(c0)) {
	return false;
	}
	}

	// Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj \| EBG)
	// We try to make emoji sequences with ZWJ a single grapheme cluster, but only
	// if they actually merge to one cluster. So we are more relaxed than the UAX
	// #29 rules in accepting any emoji character after the ZWJ, but are tighter
	// in that we only treat it as one cluster if a ligature is actually formed
	// and we also require the character before the ZWJ to also be an emoji.
	if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
	// look at character before ZWJ to see that both can participate in an
	// emoji zwj sequence
	uint32_t c0 = 0;
	size_t offset_backback = offset_back;
	U16_PREV(buf, start, offset_backback, c0);
	if (c0 == 0xFE0F && offset_backback > start) {
	// skip over emoji variation selector
	U16_PREV(buf, start, offset_backback, c0);
	}
	if (isEmoji(c0)) {
	return false;
	}
	}

	// Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
	// sot (RI RI)* RI x RI
	// [^RI] (RI RI)* RI x RI
	//
	// If we have font information, we have already broken the cluster if and only
	// if the second character had no advance, which means a ligature was formed.
	// If we don't, we look back like UAX #29 recommends, but only up to 1000 code
	// units.
	if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
	if (advances != nullptr) {
	// We have advances information. But if we are here, we already know c2
	// has no advance. So we should definitely disallow a break.
	return false;
	} else {
	// Look at up to 1000 code units.
	const size_t lookback_barrier =
	std::max((ssize_t)start, (ssize_t)offset_back - 1000);
	size_t offset_backback = offset_back;
	while (offset_backback > lookback_barrier) {
	uint32_t c0 = 0;
	U16_PREV(buf, lookback_barrier, offset_backback, c0);
	if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
	offset_backback += U16_LENGTH(c0);
	break;
	}
	}
	// The number 4 comes from the number of code units in a whole flag.
	return (offset - offset_backback) % 4 == 0;
	}
	}
	// Cluster Indic syllables together (tailoring of UAX #29).
	// Immediately after each virama (that is not just a pure killer) followed by
	// a letter, we disallow grapheme breaks (if we are here, we don't know about
	// advances, or we already know that c2 has no advance).
	if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama
	&& !isPureKiller(c1) &&
	u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
	return false;
	}
	// Rule GB999, Any ÷ Any
	return true;
	}

	// Moves `offset` to a grapheme cluster boundary of buf[start, start + count)
	// according to `opt` and returns the resulting offset.
	//
	// AFTER / BEFORE first step one code unit forwards / backwards (unless
	// already at the end / start of the run) and then behave like AT_OR_AFTER /
	// AT_OR_BEFORE, which keep moving in the same direction until
	// isGraphemeBreak() reports a break. AT returns `offset` unchanged if it is a
	// break and (size_t)-1 otherwise.
	size_t GraphemeBreak::getTextRunCursor(const float* advances,
	                                       const uint16_t* buf,
	                                       size_t start,
	                                       size_t count,
	                                       size_t offset,
	                                       MoveOpt opt) {
	  if (opt == AT) {
	    const bool at_break = isGraphemeBreak(advances, buf, start, count, offset);
	    return at_break ? offset : (size_t)-1;
	  }

	  if (opt == AFTER || opt == AT_OR_AFTER) {
	    if (opt == AFTER && offset < start + count) {
	      ++offset;
	    }
	    // Scan forwards to the next boundary.
	    while (!isGraphemeBreak(advances, buf, start, count, offset)) {
	      ++offset;
	    }
	    return offset;
	  }

	  if (opt == BEFORE || opt == AT_OR_BEFORE) {
	    if (opt == BEFORE && offset > start) {
	      --offset;
	    }
	    // Scan backwards to the previous boundary.
	    while (!isGraphemeBreak(advances, buf, start, count, offset)) {
	      --offset;
	    }
	    return offset;
	  }

	  // Any other option value leaves the offset untouched.
	  return offset;
	}

	} // namespace minikin