| /* |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <unicode/uchar.h> |
| #include <unicode/uscript.h> |
| #include <algorithm> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| // HACK: for reading pattern file |
| #include <fcntl.h> |
| |
| #define LOG_TAG "Minikin" |
| |
| #include "minikin/Hyphenator.h" |
| #include "utils/WindowsUtils.h" |
| |
| using std::vector; |
| |
| namespace minikin { |
| |
| static const uint16_t CHAR_HYPHEN_MINUS = 0x002D; |
| static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; |
| static const uint16_t CHAR_MIDDLE_DOT = 0x00B7; |
| static const uint16_t CHAR_HYPHEN = 0x2010; |
| |
| // The following are structs that correspond to tables inside the hyb file |
| // format |
| |
| struct AlphabetTable0 { |
| uint32_t version; |
| uint32_t min_codepoint; |
| uint32_t max_codepoint; |
| uint8_t data[1]; // actually flexible array, size is known at runtime |
| }; |
| |
| struct AlphabetTable1 { |
| uint32_t version; |
| uint32_t n_entries; |
| uint32_t data[1]; // actually flexible array, size is known at runtime |
| |
| static uint32_t codepoint(uint32_t entry) { return entry >> 11; } |
| static uint32_t value(uint32_t entry) { return entry & 0x7ff; } |
| }; |
| |
| struct Trie { |
| uint32_t version; |
| uint32_t char_mask; |
| uint32_t link_shift; |
| uint32_t link_mask; |
| uint32_t pattern_shift; |
| uint32_t n_entries; |
| uint32_t data[1]; // actually flexible array, size is known at runtime |
| }; |
| |
| struct Pattern { |
| uint32_t version; |
| uint32_t n_entries; |
| uint32_t pattern_offset; |
| uint32_t pattern_size; |
| uint32_t data[1]; // actually flexible array, size is known at runtime |
| |
| // accessors |
| static uint32_t len(uint32_t entry) { return entry >> 26; } |
| static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } |
| const uint8_t* buf(uint32_t entry) const { |
| return reinterpret_cast<const uint8_t*>(this) + pattern_offset + |
| (entry & 0xfffff); |
| } |
| }; |
| |
| struct Header { |
| uint32_t magic; |
| uint32_t version; |
| uint32_t alphabet_offset; |
| uint32_t trie_offset; |
| uint32_t pattern_offset; |
| uint32_t file_size; |
| |
| // accessors |
| const uint8_t* bytes() const { |
| return reinterpret_cast<const uint8_t*>(this); |
| } |
| uint32_t alphabetVersion() const { |
| return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); |
| } |
| const AlphabetTable0* alphabetTable0() const { |
| return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); |
| } |
| const AlphabetTable1* alphabetTable1() const { |
| return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); |
| } |
| const Trie* trieTable() const { |
| return reinterpret_cast<const Trie*>(bytes() + trie_offset); |
| } |
| const Pattern* patternTable() const { |
| return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); |
| } |
| }; |
| |
| Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, |
| size_t minPrefix, |
| size_t minSuffix) { |
| Hyphenator* result = new Hyphenator; |
| result->patternData = patternData; |
| result->minPrefix = minPrefix; |
| result->minSuffix = minSuffix; |
| return result; |
| } |
| |
| void Hyphenator::hyphenate(vector<HyphenationType>* result, |
| const uint16_t* word, |
| size_t len, |
| const icu::Locale& locale) { |
| result->clear(); |
| result->resize(len); |
| const size_t paddedLen = len + 2; // start and stop code each count for 1 |
| if (patternData != nullptr && len >= minPrefix + minSuffix && |
| paddedLen <= MAX_HYPHENATED_SIZE) { |
| uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; |
| const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len); |
| if (hyphenValue != HyphenationType::DONT_BREAK) { |
| hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue); |
| return; |
| } |
| // TODO: try NFC normalization |
| // TODO: handle non-BMP Unicode (requires remapping of offsets) |
| } |
| // Note that we will always get here if the word contains a hyphen or a soft |
| // hyphen, because the alphabet is not expected to contain a hyphen or a soft |
| // hyphen character, so alphabetLookup would return DONT_BREAK. |
| hyphenateWithNoPatterns(result->data(), word, len, locale); |
| } |
| |
| // This function determines whether a character is like U+2010 HYPHEN in |
| // line breaking and usage: a character immediately after which line breaks |
| // are allowed, but words containing it should not be automatically |
| // hyphenated using patterns. This is a curated set, created by manually |
| // inspecting all the characters that have the Unicode line breaking |
| // property of BA or HY and seeing which ones are hyphens. |
| bool Hyphenator::isLineBreakingHyphen(uint32_t c) { |
| return (c == 0x002D || // HYPHEN-MINUS |
| c == 0x058A || // ARMENIAN HYPHEN |
| c == 0x05BE || // HEBREW PUNCTUATION MAQAF |
| c == 0x1400 || // CANADIAN SYLLABICS HYPHEN |
| c == 0x2010 || // HYPHEN |
| c == 0x2013 || // EN DASH |
| c == 0x2027 || // HYPHENATION POINT |
| c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN |
| c == 0x2E40); // DOUBLE HYPHEN |
| } |
| |
| const static uint32_t HYPHEN_STR[] = {0x2010, 0}; |
| const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0}; |
| const static uint32_t MAQAF_STR[] = {0x05BE, 0}; |
| const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0}; |
| const static uint32_t ZWJ_STR[] = {0x200D, 0}; |
| const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0}; |
| |
| const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) { |
| switch (hyph) { |
| case INSERT_HYPHEN_AT_END: |
| case REPLACE_WITH_HYPHEN_AT_END: |
| case INSERT_HYPHEN_AT_START: |
| return HYPHEN_STR; |
| case INSERT_ARMENIAN_HYPHEN_AT_END: |
| return ARMENIAN_HYPHEN_STR; |
| case INSERT_MAQAF_AT_END: |
| return MAQAF_STR; |
| case INSERT_UCAS_HYPHEN_AT_END: |
| return UCAS_HYPHEN_STR; |
| case INSERT_ZWJ_AND_HYPHEN_AT_END: |
| return ZWJ_AND_HYPHEN_STR; |
| case INSERT_ZWJ_AT_START: |
| return ZWJ_STR; |
| default: |
| return nullptr; |
| } |
| } |
| |
| uint32_t HyphenEdit::editForThisLine(HyphenationType type) { |
| switch (type) { |
| case HyphenationType::DONT_BREAK: |
| return NO_EDIT; |
| case HyphenationType::BREAK_AND_INSERT_HYPHEN: |
| return INSERT_HYPHEN_AT_END; |
| case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: |
| return INSERT_ARMENIAN_HYPHEN_AT_END; |
| case HyphenationType::BREAK_AND_INSERT_MAQAF: |
| return INSERT_MAQAF_AT_END; |
| case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: |
| return INSERT_UCAS_HYPHEN_AT_END; |
| case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: |
| return REPLACE_WITH_HYPHEN_AT_END; |
| case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
| return INSERT_ZWJ_AND_HYPHEN_AT_END; |
| default: |
| return BREAK_AT_END; |
| } |
| } |
| |
| uint32_t HyphenEdit::editForNextLine(HyphenationType type) { |
| switch (type) { |
| case HyphenationType::DONT_BREAK: |
| return NO_EDIT; |
| case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: |
| return INSERT_HYPHEN_AT_START; |
| case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
| return INSERT_ZWJ_AT_START; |
| default: |
| return BREAK_AT_START; |
| } |
| } |
| |
| static UScriptCode getScript(uint32_t codePoint) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| const UScriptCode script = |
| uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); |
| if (U_SUCCESS(errorCode)) { |
| return script; |
| } else { |
| return USCRIPT_INVALID_CODE; |
| } |
| } |
| |
| static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { |
| // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the |
| // "correct" hyphen for Hebrew, modern practice may have shifted towards |
| // Western hyphens. We use normal hyphens for now to be safe. |
| // BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to |
| // maqaf for Hebrew, we can simply add a condition here. |
| const UScriptCode script = getScript(codePoint); |
| if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || |
| script == USCRIPT_TAMIL || script == USCRIPT_TELUGU) { |
| // Grantha is not included, since we don't support non-BMP hyphenation yet. |
| return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
| } else if (script == USCRIPT_ARMENIAN) { |
| return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; |
| } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { |
| return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; |
| } else { |
| return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
| } |
| } |
| |
| static inline int32_t getJoiningType(UChar32 codepoint) { |
| return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); |
| } |
| |
| // Assumption for caller: location must be >= 2 and word[location] == |
| // CHAR_SOFT_HYPHEN. This function decides if the letters before and after the |
| // hyphen should appear as joining. |
| static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, |
| size_t len, |
| size_t location) { |
| ssize_t i = location; |
| int32_t type = U_JT_NON_JOINING; |
| while (static_cast<size_t>(i) < len && |
| (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
| i++; |
| } |
| if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || |
| type == U_JT_JOIN_CAUSING) { |
| // The next character is of the type that may join the last character. See |
| // if the last character is also of the right type. |
| i = location - 2; // Skip the soft hyphen |
| type = U_JT_NON_JOINING; |
| while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
| i--; |
| } |
| if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || |
| type == U_JT_JOIN_CAUSING) { |
| return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; |
| } |
| } |
| return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
| } |
| |
| // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for |
| // hyphenating words that didn't match patterns, especially words that contain |
| // hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of |
| // Soft Hyphen). |
| void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, |
| const uint16_t* word, |
| size_t len, |
| const icu::Locale& locale) { |
| result[0] = HyphenationType::DONT_BREAK; |
| for (size_t i = 1; i < len; i++) { |
| const uint16_t prevChar = word[i - 1]; |
| if (i > 1 && isLineBreakingHyphen(prevChar)) { |
| // Break after hyphens, but only if they don't start the word. |
| |
| if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) && |
| strcmp(locale.getLanguage(), "pl") == 0 && |
| getScript(word[i]) == USCRIPT_LATIN) { |
| // In Polish, hyphens get repeated at the next line. To be safe, |
| // we will do this only if the next character is Latin. |
| result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; |
| } else { |
| result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
| } |
| } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { |
| // Break after soft hyphens, but only if they don't start the word (a soft |
| // hyphen starting the word doesn't give any useful break opportunities). |
| // The type of the break is based on the script of the character we break |
| // on. |
| if (getScript(word[i]) == USCRIPT_ARABIC) { |
| // For Arabic, we need to look and see if the characters around the soft |
| // hyphen actually join. If they don't, we'll just insert a normal |
| // hyphen. |
| result[i] = getHyphTypeForArabic(word, len, i); |
| } else { |
| result[i] = hyphenationTypeBasedOnScript(word[i]); |
| } |
| } else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i && |
| i <= len - minSuffix && |
| ((word[i - 2] == 'l' && word[i] == 'l') || |
| (word[i - 2] == 'L' && word[i] == 'L')) && |
| strcmp(locale.getLanguage(), "ca") == 0) { |
| // In Catalan, "l·l" should break as "l-" on the first line |
| // and "l" on the next line. |
| result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; |
| } else { |
| result[i] = HyphenationType::DONT_BREAK; |
| } |
| } |
| } |
| |
| HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, |
| const uint16_t* word, |
| size_t len) { |
| const Header* header = getHeader(); |
| HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; |
| // TODO: check header magic |
| uint32_t alphabetVersion = header->alphabetVersion(); |
| if (alphabetVersion == 0) { |
| const AlphabetTable0* alphabet = header->alphabetTable0(); |
| uint32_t min_codepoint = alphabet->min_codepoint; |
| uint32_t max_codepoint = alphabet->max_codepoint; |
| alpha_codes[0] = 0; // word start |
| for (size_t i = 0; i < len; i++) { |
| uint16_t c = word[i]; |
| if (c < min_codepoint || c >= max_codepoint) { |
| return HyphenationType::DONT_BREAK; |
| } |
| uint8_t code = alphabet->data[c - min_codepoint]; |
| if (code == 0) { |
| return HyphenationType::DONT_BREAK; |
| } |
| if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
| result = hyphenationTypeBasedOnScript(c); |
| } |
| alpha_codes[i + 1] = code; |
| } |
| alpha_codes[len + 1] = 0; // word termination |
| return result; |
| } else if (alphabetVersion == 1) { |
| const AlphabetTable1* alphabet = header->alphabetTable1(); |
| size_t n_entries = alphabet->n_entries; |
| const uint32_t* begin = alphabet->data; |
| const uint32_t* end = begin + n_entries; |
| alpha_codes[0] = 0; |
| for (size_t i = 0; i < len; i++) { |
| uint16_t c = word[i]; |
| auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << 11); |
| if (p == end) { |
| return HyphenationType::DONT_BREAK; |
| } |
| uint32_t entry = *p; |
| if (AlphabetTable1::codepoint(entry) != c) { |
| return HyphenationType::DONT_BREAK; |
| } |
| if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
| result = hyphenationTypeBasedOnScript(c); |
| } |
| alpha_codes[i + 1] = AlphabetTable1::value(entry); |
| } |
| alpha_codes[len + 1] = 0; |
| return result; |
| } |
| return HyphenationType::DONT_BREAK; |
| } |
| |
| /** |
| * Internal implementation, after conversion to codes. All case folding and |
| *normalization has been done by now, and all characters have been found in the |
| *alphabet. Note: len here is the padded length including 0 codes at start and |
| *end. |
| **/ |
| void Hyphenator::hyphenateFromCodes(HyphenationType* result, |
| const uint16_t* codes, |
| size_t len, |
| HyphenationType hyphenValue) { |
| static_assert(sizeof(HyphenationType) == sizeof(uint8_t), |
| "HyphnationType must be uint8_t."); |
| // Reuse the result array as a buffer for calculating intermediate hyphenation |
| // numbers. |
| uint8_t* buffer = reinterpret_cast<uint8_t*>(result); |
| |
| const Header* header = getHeader(); |
| const Trie* trie = header->trieTable(); |
| const Pattern* pattern = header->patternTable(); |
| uint32_t char_mask = trie->char_mask; |
| uint32_t link_shift = trie->link_shift; |
| uint32_t link_mask = trie->link_mask; |
| uint32_t pattern_shift = trie->pattern_shift; |
| size_t maxOffset = len - minSuffix - 1; |
| for (size_t i = 0; i < len - 1; i++) { |
| uint32_t node = 0; // index into Trie table |
| for (size_t j = i; j < len; j++) { |
| uint16_t c = codes[j]; |
| uint32_t entry = trie->data[node + c]; |
| if ((entry & char_mask) == c) { |
| node = (entry & link_mask) >> link_shift; |
| } else { |
| break; |
| } |
| uint32_t pat_ix = trie->data[node] >> pattern_shift; |
| // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), |
| // and an offset into the buf pool. This is the pattern for the substring |
| // (i..j) we just matched, which we combine (via point-wise max) into the |
| // buffer vector. |
| if (pat_ix != 0) { |
| uint32_t pat_entry = pattern->data[pat_ix]; |
| int pat_len = Pattern::len(pat_entry); |
| int pat_shift = Pattern::shift(pat_entry); |
| const uint8_t* pat_buf = pattern->buf(pat_entry); |
| int offset = j + 1 - (pat_len + pat_shift); |
| // offset is the index within buffer that lines up with the start of |
| // pat_buf |
| int start = std::max((int)minPrefix - offset, 0); |
| int end = std::min(pat_len, (int)maxOffset - offset); |
| for (int k = start; k < end; k++) { |
| buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); |
| } |
| } |
| } |
| } |
| // Since the above calculation does not modify values outside |
| // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK. |
| for (size_t i = minPrefix; i < maxOffset; i++) { |
| // Hyphenation opportunities happen when the hyphenation numbers are odd. |
| result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; |
| } |
| } |
| |
| } // namespace minikin |