| // Copyright 2013 The Flutter Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| import 'fragmenter.dart'; |
| import 'line_break_properties.dart'; |
| import 'unicode_range.dart'; |
| |
/// The kinds of line break that can be reported at a position in the text,
/// following the terminology of the Unicode line breaking algorithm.
enum LineBreakType {
  /// A line may be broken here, but it doesn't have to be.
  opportunity,

  /// A line must not be broken here.
  prohibited,

  /// A hard line break: the line has to end here.
  mandatory,

  /// The end of the text.
  ///
  /// The Unicode spec treats the end of text as a line break too, so this
  /// behaves like [mandatory]. It is kept as a separate value so that the
  /// universal end-of-text break can be told apart from a break caused by a
  /// "\n" that happens to be the last character of the text.
  endOfText,
}
| |
/// A [TextFragmenter] that cuts [text] into [LineBreakFragment]s, one per
/// line break found in the text.
class LineBreakFragmenter extends TextFragmenter {
  const LineBreakFragmenter(super.text);

  /// Returns the line break fragments of [text], in text order.
  @override
  List<LineBreakFragment> fragment() => _computeLineBreakFragments(text);
}
| |
/// A range of text that ends at a line break of the given [type].
///
/// Two fragments are equal when their range, [type], [trailingNewlines] and
/// [trailingSpaces] are all equal.
class LineBreakFragment extends TextFragment {
  const LineBreakFragment(
    super.start,
    super.end,
    this.type, {
    required this.trailingNewlines,
    required this.trailingSpaces,
  });

  /// The kind of line break found at the end of this fragment.
  final LineBreakType type;

  /// The number of newline characters counted at the end of this fragment.
  final int trailingNewlines;

  /// The number of space characters counted at the end of this fragment.
  ///
  /// The fragment computation in this file also counts trailing newline
  /// characters towards this number.
  final int trailingSpaces;

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) {
      return true;
    }
    if (other is! LineBreakFragment) {
      return false;
    }
    return start == other.start &&
        end == other.end &&
        type == other.type &&
        trailingNewlines == other.trailingNewlines &&
        trailingSpaces == other.trailingSpaces;
  }

  @override
  int get hashCode =>
      Object.hash(start, end, type, trailingNewlines, trailingSpaces);

  @override
  String toString() => 'LineBreakFragment($start, $end, $type)';
}
| |
/// Whether [prop] is a line break class that forces a line break (BK or LF).
///
/// NL isn't checked here because it has already been normalized to BK.
bool _isHardBreak(LineCharProperty? prop) =>
    prop == LineCharProperty.BK || prop == LineCharProperty.LF;
| |
/// Whether [prop] is one of the letter classes AL or HL.
bool _isALorHL(LineCharProperty? prop) {
  if (prop == LineCharProperty.HL) {
    return true;
  }
  return prop == LineCharProperty.AL;
}
| |
/// Whether [prop] is one of the classes that make up a Korean Syllable block:
/// JL, JV, JT, H2 or H3.
///
/// See:
/// - https://unicode.org/reports/tr14/tr14-45.html#LB27
bool _isKoreanSyllable(LineCharProperty? prop) {
  // Precomposed syllable classes.
  if (prop == LineCharProperty.H2 || prop == LineCharProperty.H3) {
    return true;
  }
  // Conjoining jamo classes.
  return prop == LineCharProperty.JL ||
      prop == LineCharProperty.JV ||
      prop == LineCharProperty.JT;
}
| |
/// Whether [charCode] is treated as having an East Asian width property of F,
/// W or H.
///
/// The check is done against a fixed set of ranges: U+2329, U+3008 to U+301D
/// and U+FE17 to U+FF62 (all bounds inclusive).
///
/// See:
/// - https://www.unicode.org/reports/tr14/tr14-45.html#LB30
/// - https://www.unicode.org/Public/13.0.0/ucd/EastAsianWidth.txt
bool _hasEastAsianWidthFWH(int charCode) {
  if (charCode == 0x2329) {
    return true;
  }
  if (charCode >= 0x3008 && charCode <= 0x301D) {
    return true;
  }
  return charCode >= 0xFE17 && charCode <= 0xFF62;
}
| |
/// Whether [codePoint] is above U+FFFF, i.e. it takes a surrogate pair (two
/// UTF-16 code units) to represent it in a string.
///
/// Returns false when [codePoint] is null.
bool _isSurrogatePair(int? codePoint) => (codePoint ?? 0) > 0xFFFF;
| |
/// Computes all the line break fragments of the given [text].
///
/// The text is walked once from start to end. At each position, the rules of
/// the Unicode line breaking algorithm are checked in order, and the first one
/// that matches decides whether a break is prohibited, possible or mandatory
/// at that position. A new [LineBreakFragment] is emitted for every position
/// that isn't a prohibited break.
///
/// The returned fragments are in text order, each one starts where the
/// previous one ended, and the last one always has the type
/// [LineBreakType.endOfText].
///
/// A few rules (LB13, LB14 and LB15) intentionally deviate from the spec when
/// spaces are involved; see the comments next to each rule.
///
/// We think about indices as pointing between characters, and they go all the
/// way from 0 to the string length. For example, here are the indices for the
/// string "foo bar":
///
/// ```
///   f   o   o       b   a   r
/// ^   ^   ^   ^   ^   ^   ^   ^
/// 0   1   2   3   4   5   6   7
/// ```
///
/// This way the indices work well with [String.substring].
///
/// Useful resources:
///
/// * https://www.unicode.org/reports/tr14/tr14-45.html#Algorithm
/// * https://www.unicode.org/Public/11.0.0/ucd/LineBreak.txt
List<LineBreakFragment> _computeLineBreakFragments(String text) {
  final List<LineBreakFragment> fragments = <LineBreakFragment>[];

  // Keeps track of the character two positions behind.
  LineCharProperty? prev2;
  // Keeps track of the character one position behind, i.e. the character right
  // before `index`.
  LineCharProperty? prev1;

  // The code point (and its property) of the character right after `index`.
  int? codePoint = getCodePoint(text, 0);
  LineCharProperty? curr = lineLookup.findForChar(codePoint);

  // When there's a sequence of spaces, this variable contains the base property
  // i.e. the property of the character preceding the sequence.
  LineCharProperty baseOfSpaceSequence = LineCharProperty.WJ;

  // When there's a sequence of combining marks, this variable contains the base
  // property i.e. the property of the character preceding the sequence.
  LineCharProperty baseOfCombiningMarks = LineCharProperty.AL;

  int index = 0;
  int trailingNewlines = 0;
  int trailingSpaces = 0;

  int fragmentStart = 0;

  // Records the decision taken for the position at `index`.
  //
  // Trailing space/newline counters are updated on every call. A fragment is
  // only emitted when `type` isn't `prohibited`.
  //
  // `debugRuleNumber` is the number of the rule that led to the decision. It
  // isn't read by this function (presumably kept as a debugging aid).
  void setBreak(LineBreakType type, int debugRuleNumber) {
    final int fragmentEnd =
        type == LineBreakType.endOfText ? text.length : index;
    assert(fragmentEnd >= fragmentStart);

    // NOTE(review): the counters are only reset when a fragment is emitted. If
    // a space is followed by a non-space across a prohibited break (e.g. LB16
    // or LB17 with intervening spaces), the earlier spaces still count as
    // trailing when the fragment is finally emitted. Confirm this is intended.
    if (prev1 == LineCharProperty.SP) {
      trailingSpaces++;
    } else if (_isHardBreak(prev1) || prev1 == LineCharProperty.CR) {
      // Newline characters count as both trailing newlines and trailing spaces.
      trailingNewlines++;
      trailingSpaces++;
    }

    if (type == LineBreakType.prohibited) {
      // Don't create a fragment.
      return;
    }

    fragments.add(LineBreakFragment(
      fragmentStart,
      fragmentEnd,
      type,
      trailingNewlines: trailingNewlines,
      trailingSpaces: trailingSpaces,
    ));

    fragmentStart = index;

    // Reset trailing spaces/newlines counter after a new fragment.
    trailingNewlines = 0;
    trailingSpaces = 0;

    // Forget the characters of the fragment that was just closed. The loop
    // below copies `prev1` into `prev2` and `curr` into `prev1` at the start
    // of its next iteration.
    prev1 = prev2 = null;
  }

  // Never break at the start of text.
  // LB2: sot ×
  setBreak(LineBreakType.prohibited, 2);

  // Skip index 0 because a line break can't exist at the start of text.
  index++;

  int regionalIndicatorCount = 0;

  // We need to go until `text.length` in order to handle the case where the
  // paragraph ends with a hard break. In this case, there will be an empty line
  // at the end.
  for (; index <= text.length; index++) {
    prev2 = prev1;
    prev1 = curr;

    if (_isSurrogatePair(codePoint)) {
      // Can't break in the middle of a surrogate pair.
      setBreak(LineBreakType.prohibited, -1);
      // Advance `index` one extra step to skip the tail of the surrogate pair.
      index++;
    }

    codePoint = getCodePoint(text, index);
    curr = lineLookup.findForChar(codePoint);

    // Keep count of the RI (regional indicator) sequence.
    if (prev1 == LineCharProperty.RI) {
      regionalIndicatorCount++;
    } else {
      regionalIndicatorCount = 0;
    }

    // Always break after hard line breaks.
    // LB4: BK !
    //
    // Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
    // LB5: LF !
    //      NL !
    if (_isHardBreak(prev1)) {
      setBreak(LineBreakType.mandatory, 5);
      continue;
    }

    if (prev1 == LineCharProperty.CR) {
      if (curr == LineCharProperty.LF) {
        // LB5: CR × LF
        setBreak(LineBreakType.prohibited, 5);
      } else {
        // LB5: CR !
        setBreak(LineBreakType.mandatory, 5);
      }
      continue;
    }

    // Do not break before hard line breaks.
    // LB6: × ( BK | CR | LF | NL )
    if (_isHardBreak(curr) || curr == LineCharProperty.CR) {
      setBreak(LineBreakType.prohibited, 6);
      continue;
    }

    // The end of text has been reached and it wasn't preceded by a hard break.
    // The end-of-text break is handled after the loop.
    if (index >= text.length) {
      break;
    }

    // Establish the base for the space sequence.
    if (prev1 != LineCharProperty.SP) {
      // When the text/line starts with SP, we should treat the beginning of text/line
      // as if it were a WJ (word joiner).
      baseOfSpaceSequence = prev1 ?? LineCharProperty.WJ;
    }

    // Do not break before spaces or zero width space.
    // LB7: × SP
    //      × ZW
    if (curr == LineCharProperty.SP || curr == LineCharProperty.ZW) {
      setBreak(LineBreakType.prohibited, 7);
      continue;
    }

    // Break before any character following a zero-width space, even if one or
    // more spaces intervene.
    // LB8: ZW SP* ÷
    if (prev1 == LineCharProperty.ZW ||
        baseOfSpaceSequence == LineCharProperty.ZW) {
      setBreak(LineBreakType.opportunity, 8);
      continue;
    }

    // Do not break after a zero width joiner.
    // LB8a: ZWJ ×
    if (prev1 == LineCharProperty.ZWJ) {
      setBreak(LineBreakType.prohibited, 8);
      continue;
    }

    // Establish the base for the sequences of combining marks.
    if (prev1 != LineCharProperty.CM && prev1 != LineCharProperty.ZWJ) {
      baseOfCombiningMarks = prev1 ?? LineCharProperty.AL;
    }

    // Do not break a combining character sequence; treat it as if it has the
    // line breaking class of the base character in all of the following rules.
    // Treat ZWJ as if it were CM.
    if (curr == LineCharProperty.CM || curr == LineCharProperty.ZWJ) {
      if (baseOfCombiningMarks == LineCharProperty.SP) {
        // LB10: Treat any remaining combining mark or ZWJ as AL.
        curr = LineCharProperty.AL;
      } else {
        // LB9: Treat X (CM | ZWJ)* as if it were X
        //      where X is any line break class except BK, NL, LF, CR, SP, or ZW.
        curr = baseOfCombiningMarks;
        if (curr == LineCharProperty.RI) {
          // Prevent the previous RI from being double-counted.
          regionalIndicatorCount--;
        }
        setBreak(LineBreakType.prohibited, 9);
        continue;
      }
    }
    // In certain situations (e.g. CM immediately following a hard break), we
    // need to also check if the previous character was CM/ZWJ. That's because
    // hard breaks caused the previous iteration to short-circuit, which leads
    // to `baseOfCombiningMarks` not being updated properly.
    if (prev1 == LineCharProperty.CM || prev1 == LineCharProperty.ZWJ) {
      prev1 = baseOfCombiningMarks;
    }

    // Do not break before or after Word joiner and related characters.
    // LB11: × WJ
    //       WJ ×
    if (curr == LineCharProperty.WJ || prev1 == LineCharProperty.WJ) {
      setBreak(LineBreakType.prohibited, 11);
      continue;
    }

    // Do not break after NBSP and related characters.
    // LB12: GL ×
    if (prev1 == LineCharProperty.GL) {
      setBreak(LineBreakType.prohibited, 12);
      continue;
    }

    // Do not break before NBSP and related characters, except after spaces and
    // hyphens.
    // LB12a: [^SP BA HY] × GL
    if (!(prev1 == LineCharProperty.SP ||
            prev1 == LineCharProperty.BA ||
            prev1 == LineCharProperty.HY) &&
        curr == LineCharProperty.GL) {
      setBreak(LineBreakType.prohibited, 12);
      continue;
    }

    // Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
    // LB13: × CL
    //       × CP
    //       × EX
    //       × IS
    //       × SY
    //
    // The above is a quote from unicode.org. In our implementation, we did the
    // following modification: When there are spaces present, we consider it a
    // line break opportunity.
    if (prev1 != LineCharProperty.SP &&
        (curr == LineCharProperty.CL ||
            curr == LineCharProperty.CP ||
            curr == LineCharProperty.EX ||
            curr == LineCharProperty.IS ||
            curr == LineCharProperty.SY)) {
      setBreak(LineBreakType.prohibited, 13);
      continue;
    }

    // Do not break after ‘[’, even after spaces.
    // LB14: OP SP* ×
    //
    // The above is a quote from unicode.org. In our implementation, we did the
    // following modification: Allow breaks when there are spaces.
    if (prev1 == LineCharProperty.OP) {
      setBreak(LineBreakType.prohibited, 14);
      continue;
    }

    // Do not break within ‘”[’, even with intervening spaces.
    // LB15: QU SP* × OP
    //
    // The above is a quote from unicode.org. In our implementation, we did the
    // following modification: Allow breaks when there are spaces.
    if (prev1 == LineCharProperty.QU && curr == LineCharProperty.OP) {
      setBreak(LineBreakType.prohibited, 15);
      continue;
    }

    // Do not break between closing punctuation and a nonstarter, even with
    // intervening spaces.
    // LB16: (CL | CP) SP* × NS
    if ((prev1 == LineCharProperty.CL ||
            baseOfSpaceSequence == LineCharProperty.CL ||
            prev1 == LineCharProperty.CP ||
            baseOfSpaceSequence == LineCharProperty.CP) &&
        curr == LineCharProperty.NS) {
      setBreak(LineBreakType.prohibited, 16);
      continue;
    }

    // Do not break within ‘——’, even with intervening spaces.
    // LB17: B2 SP* × B2
    if ((prev1 == LineCharProperty.B2 ||
            baseOfSpaceSequence == LineCharProperty.B2) &&
        curr == LineCharProperty.B2) {
      setBreak(LineBreakType.prohibited, 17);
      continue;
    }

    // Break after spaces.
    // LB18: SP ÷
    if (prev1 == LineCharProperty.SP) {
      setBreak(LineBreakType.opportunity, 18);
      continue;
    }

    // Do not break before or after quotation marks, such as ‘”’.
    // LB19: × QU
    //       QU ×
    if (prev1 == LineCharProperty.QU || curr == LineCharProperty.QU) {
      setBreak(LineBreakType.prohibited, 19);
      continue;
    }

    // Break before and after unresolved CB.
    // LB20: ÷ CB
    //       CB ÷
    //
    // In flutter web, we use this as an object-replacement character for
    // placeholders.
    if (prev1 == LineCharProperty.CB || curr == LineCharProperty.CB) {
      setBreak(LineBreakType.opportunity, 20);
      continue;
    }

    // Do not break before hyphen-minus, other hyphens, fixed-width spaces,
    // small kana, and other non-starters, or after acute accents.
    // LB21: × BA
    //       × HY
    //       × NS
    //       BB ×
    if (curr == LineCharProperty.BA ||
        curr == LineCharProperty.HY ||
        curr == LineCharProperty.NS ||
        prev1 == LineCharProperty.BB) {
      setBreak(LineBreakType.prohibited, 21);
      continue;
    }

    // Don't break after Hebrew + Hyphen.
    // LB21a: HL (HY | BA) ×
    if (prev2 == LineCharProperty.HL &&
        (prev1 == LineCharProperty.HY || prev1 == LineCharProperty.BA)) {
      setBreak(LineBreakType.prohibited, 21);
      continue;
    }

    // Don’t break between Solidus and Hebrew letters.
    // LB21b: SY × HL
    if (prev1 == LineCharProperty.SY && curr == LineCharProperty.HL) {
      setBreak(LineBreakType.prohibited, 21);
      continue;
    }

    // Do not break before ellipses.
    // LB22: × IN
    if (curr == LineCharProperty.IN) {
      setBreak(LineBreakType.prohibited, 22);
      continue;
    }

    // Do not break between digits and letters.
    // LB23: (AL | HL) × NU
    //       NU × (AL | HL)
    if ((_isALorHL(prev1) && curr == LineCharProperty.NU) ||
        (prev1 == LineCharProperty.NU && _isALorHL(curr))) {
      setBreak(LineBreakType.prohibited, 23);
      continue;
    }

    // Do not break between numeric prefixes and ideographs, or between
    // ideographs and numeric postfixes.
    // LB23a: PR × (ID | EB | EM)
    if (prev1 == LineCharProperty.PR &&
        (curr == LineCharProperty.ID ||
            curr == LineCharProperty.EB ||
            curr == LineCharProperty.EM)) {
      setBreak(LineBreakType.prohibited, 23);
      continue;
    }
    // LB23a: (ID | EB | EM) × PO
    if ((prev1 == LineCharProperty.ID ||
            prev1 == LineCharProperty.EB ||
            prev1 == LineCharProperty.EM) &&
        curr == LineCharProperty.PO) {
      setBreak(LineBreakType.prohibited, 23);
      continue;
    }

    // Do not break between numeric prefix/postfix and letters, or between
    // letters and prefix/postfix.
    // LB24: (PR | PO) × (AL | HL)
    if ((prev1 == LineCharProperty.PR || prev1 == LineCharProperty.PO) &&
        _isALorHL(curr)) {
      setBreak(LineBreakType.prohibited, 24);
      continue;
    }
    // LB24: (AL | HL) × (PR | PO)
    if (_isALorHL(prev1) &&
        (curr == LineCharProperty.PR || curr == LineCharProperty.PO)) {
      setBreak(LineBreakType.prohibited, 24);
      continue;
    }

    // Do not break between the following pairs of classes relevant to numbers.
    // LB25: (CL | CP | NU) × (PO | PR)
    if ((prev1 == LineCharProperty.CL ||
            prev1 == LineCharProperty.CP ||
            prev1 == LineCharProperty.NU) &&
        (curr == LineCharProperty.PO || curr == LineCharProperty.PR)) {
      setBreak(LineBreakType.prohibited, 25);
      continue;
    }
    // LB25: (PO | PR) × OP
    if ((prev1 == LineCharProperty.PO || prev1 == LineCharProperty.PR) &&
        curr == LineCharProperty.OP) {
      setBreak(LineBreakType.prohibited, 25);
      continue;
    }
    // LB25: (PO | PR | HY | IS | NU | SY) × NU
    if ((prev1 == LineCharProperty.PO ||
            prev1 == LineCharProperty.PR ||
            prev1 == LineCharProperty.HY ||
            prev1 == LineCharProperty.IS ||
            prev1 == LineCharProperty.NU ||
            prev1 == LineCharProperty.SY) &&
        curr == LineCharProperty.NU) {
      setBreak(LineBreakType.prohibited, 25);
      continue;
    }

    // Do not break a Korean syllable.
    // LB26: JL × (JL | JV | H2 | H3)
    if (prev1 == LineCharProperty.JL &&
        (curr == LineCharProperty.JL ||
            curr == LineCharProperty.JV ||
            curr == LineCharProperty.H2 ||
            curr == LineCharProperty.H3)) {
      setBreak(LineBreakType.prohibited, 26);
      continue;
    }
    // LB26: (JV | H2) × (JV | JT)
    if ((prev1 == LineCharProperty.JV || prev1 == LineCharProperty.H2) &&
        (curr == LineCharProperty.JV || curr == LineCharProperty.JT)) {
      setBreak(LineBreakType.prohibited, 26);
      continue;
    }
    // LB26: (JT | H3) × JT
    if ((prev1 == LineCharProperty.JT || prev1 == LineCharProperty.H3) &&
        curr == LineCharProperty.JT) {
      setBreak(LineBreakType.prohibited, 26);
      continue;
    }

    // Treat a Korean Syllable Block the same as ID.
    // LB27: (JL | JV | JT | H2 | H3) × PO
    if (_isKoreanSyllable(prev1) && curr == LineCharProperty.PO) {
      setBreak(LineBreakType.prohibited, 27);
      continue;
    }
    // LB27: PR × (JL | JV | JT | H2 | H3)
    if (prev1 == LineCharProperty.PR && _isKoreanSyllable(curr)) {
      setBreak(LineBreakType.prohibited, 27);
      continue;
    }

    // Do not break between alphabetics.
    // LB28: (AL | HL) × (AL | HL)
    if (_isALorHL(prev1) && _isALorHL(curr)) {
      setBreak(LineBreakType.prohibited, 28);
      continue;
    }

    // Do not break between numeric punctuation and alphabetics (“e.g.”).
    // LB29: IS × (AL | HL)
    if (prev1 == LineCharProperty.IS && _isALorHL(curr)) {
      setBreak(LineBreakType.prohibited, 29);
      continue;
    }

    // Do not break between letters, numbers, or ordinary symbols and opening or
    // closing parentheses.
    // LB30: (AL | HL | NU) × OP
    //
    // LB30 requires that we exclude characters that have an Eastern Asian width
    // property of value F, W or H classes.
    if ((_isALorHL(prev1) || prev1 == LineCharProperty.NU) &&
        curr == LineCharProperty.OP &&
        !_hasEastAsianWidthFWH(text.codeUnitAt(index))) {
      setBreak(LineBreakType.prohibited, 30);
      continue;
    }
    // LB30: CP × (AL | HL | NU)
    if (prev1 == LineCharProperty.CP &&
        !_hasEastAsianWidthFWH(text.codeUnitAt(index - 1)) &&
        (_isALorHL(curr) || curr == LineCharProperty.NU)) {
      setBreak(LineBreakType.prohibited, 30);
      continue;
    }

    // Break between two regional indicator symbols if and only if there are an
    // even number of regional indicators preceding the position of the break.
    // LB30a: sot (RI RI)* RI × RI
    //        [^RI] (RI RI)* RI × RI
    if (curr == LineCharProperty.RI) {
      if (regionalIndicatorCount.isOdd) {
        setBreak(LineBreakType.prohibited, 30);
      } else {
        setBreak(LineBreakType.opportunity, 30);
      }
      continue;
    }

    // Do not break between an emoji base and an emoji modifier.
    // LB30b: EB × EM
    if (prev1 == LineCharProperty.EB && curr == LineCharProperty.EM) {
      setBreak(LineBreakType.prohibited, 30);
      continue;
    }

    // Break everywhere else.
    // LB31: ALL ÷
    //       ÷ ALL
    setBreak(LineBreakType.opportunity, 31);
  }

  // Always break at the end of text.
  // LB3: ! eot
  setBreak(LineBreakType.endOfText, 3);

  return fragments;
}