Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 1 | /* |
Behdad Esfahbod | 2409d5f | 2011-04-21 17:14:28 -0400 | [diff] [blame] | 2 | * Copyright © 2009 Red Hat, Inc. |
| 3 | * Copyright © 2009 Keith Stribley |
| 4 | * Copyright © 2011 Google, Inc. |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 5 | * |
Behdad Esfahbod | c755cb3 | 2010-04-22 00:11:43 -0400 | [diff] [blame] | 6 | * This is part of HarfBuzz, a text shaping library. |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 7 | * |
| 8 | * Permission is hereby granted, without written agreement and without |
| 9 | * license or royalty fees, to use, copy, modify, and distribute this |
| 10 | * software and its documentation for any purpose, provided that the |
| 11 | * above copyright notice and the following two paragraphs appear in |
| 12 | * all copies of this software. |
| 13 | * |
| 14 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
| 15 | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| 16 | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
| 17 | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| 18 | * DAMAGE. |
| 19 | * |
| 20 | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
| 21 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
| 23 | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
| 24 | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
| 25 | * |
| 26 | * Red Hat Author(s): Behdad Esfahbod |
Behdad Esfahbod | 2409d5f | 2011-04-21 17:14:28 -0400 | [diff] [blame] | 27 | * Google Author(s): Behdad Esfahbod |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 28 | */ |
| 29 | |
Behdad Esfahbod | c57d454 | 2011-04-20 18:50:27 -0400 | [diff] [blame] | 30 | #include "hb-private.hh" |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 31 | |
| 32 | #include "hb-icu.h" |
| 33 | |
Behdad Esfahbod | fb194b8 | 2011-04-20 02:00:47 -0400 | [diff] [blame] | 34 | #include "hb-unicode-private.hh" |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 35 | |
| 36 | #include <unicode/uchar.h> |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 37 | #include <unicode/unorm.h> |
Behdad Esfahbod | 36a4f4a | 2012-01-18 22:16:49 -0500 | [diff] [blame] | 38 | #include <unicode/ustring.h> |
Kal Conley | 5f995db | 2016-02-26 00:36:17 +0100 | [diff] [blame] | 39 | #include <unicode/utf16.h> |
Behdad Esfahbod | 4ac4c6f | 2012-08-13 10:52:52 -0400 | [diff] [blame] | 40 | #include <unicode/uversion.h> |
Behdad Esfahbod | acdba3f | 2010-07-23 15:11:18 -0400 | [diff] [blame] | 41 | |
| 42 | |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 43 | hb_script_t |
| 44 | hb_icu_script_to_script (UScriptCode script) |
Ryan Lortie | 2fd0c57 | 2011-04-20 00:19:20 -0400 | [diff] [blame] | 45 | { |
Behdad Esfahbod | 4d559cd | 2011-04-21 14:58:23 -0400 | [diff] [blame] | 46 | if (unlikely (script == USCRIPT_INVALID_CODE)) |
| 47 | return HB_SCRIPT_INVALID; |
| 48 | |
Behdad Esfahbod | 4c9fe88 | 2011-08-26 09:18:53 +0200 | [diff] [blame] | 49 | return hb_script_from_string (uscript_getShortName (script), -1); |
Ryan Lortie | 2fd0c57 | 2011-04-20 00:19:20 -0400 | [diff] [blame] | 50 | } |
| 51 | |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 52 | UScriptCode |
| 53 | hb_icu_script_from_script (hb_script_t script) |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 54 | { |
Behdad Esfahbod | 4d559cd | 2011-04-21 14:58:23 -0400 | [diff] [blame] | 55 | if (unlikely (script == HB_SCRIPT_INVALID)) |
| 56 | return USCRIPT_INVALID_CODE; |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 57 | |
Behdad Esfahbod | 4d559cd | 2011-04-21 14:58:23 -0400 | [diff] [blame] | 58 | for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) |
| 59 | if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) |
| 60 | return (UScriptCode) i; |
Ryan Lortie | 2fd0c57 | 2011-04-20 00:19:20 -0400 | [diff] [blame] | 61 | |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 62 | return USCRIPT_UNKNOWN; |
| 63 | } |
| 64 | |
| 65 | |
Behdad Esfahbod | 21fdcee | 2012-08-01 16:23:44 -0400 | [diff] [blame] | 66 | static hb_unicode_combining_class_t |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 67 | hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 68 | hb_codepoint_t unicode, |
| 69 | void *user_data HB_UNUSED) |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 70 | |
| 71 | { |
Behdad Esfahbod | 21fdcee | 2012-08-01 16:23:44 -0400 | [diff] [blame] | 72 | return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 73 | } |
| 74 | |
| 75 | static unsigned int |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 76 | hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 77 | hb_codepoint_t unicode, |
| 78 | void *user_data HB_UNUSED) |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 79 | { |
| 80 | switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) |
| 81 | { |
| 82 | case U_EA_WIDE: |
| 83 | case U_EA_FULLWIDTH: |
| 84 | return 2; |
| 85 | case U_EA_NEUTRAL: |
| 86 | case U_EA_AMBIGUOUS: |
| 87 | case U_EA_HALFWIDTH: |
| 88 | case U_EA_NARROW: |
| 89 | return 1; |
| 90 | } |
| 91 | return 1; |
| 92 | } |
| 93 | |
| 94 | static hb_unicode_general_category_t |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 95 | hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 96 | hb_codepoint_t unicode, |
| 97 | void *user_data HB_UNUSED) |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 98 | { |
| 99 | switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) |
| 100 | { |
| 101 | case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; |
| 102 | |
| 103 | case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; |
| 104 | case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; |
| 105 | case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; |
| 106 | case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; |
| 107 | case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; |
| 108 | |
| 109 | case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; |
| 110 | case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; |
Behdad Esfahbod | 5157e12 | 2011-07-21 00:12:33 -0400 | [diff] [blame] | 111 | case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 112 | |
| 113 | case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; |
| 114 | case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; |
| 115 | case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; |
| 116 | |
| 117 | case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; |
| 118 | case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; |
| 119 | case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; |
| 120 | |
| 121 | case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; |
| 122 | case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; |
| 123 | case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; |
| 124 | case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; |
| 125 | |
| 126 | |
| 127 | case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; |
| 128 | case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; |
| 129 | case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; |
| 130 | case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; |
| 131 | case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; |
| 132 | |
| 133 | case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; |
| 134 | case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; |
| 135 | case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; |
| 136 | case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; |
| 137 | |
| 138 | case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; |
| 139 | case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; |
| 140 | } |
| 141 | |
| 142 | return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; |
| 143 | } |
| 144 | |
| 145 | static hb_codepoint_t |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 146 | hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 147 | hb_codepoint_t unicode, |
| 148 | void *user_data HB_UNUSED) |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 149 | { |
| 150 | return u_charMirror(unicode); |
| 151 | } |
| 152 | |
| 153 | static hb_script_t |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 154 | hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 155 | hb_codepoint_t unicode, |
| 156 | void *user_data HB_UNUSED) |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 157 | { |
| 158 | UErrorCode status = U_ZERO_ERROR; |
| 159 | UScriptCode scriptCode = uscript_getScript(unicode, &status); |
| 160 | |
Behdad Esfahbod | 889caa5 | 2012-01-18 22:32:52 -0500 | [diff] [blame] | 161 | if (unlikely (U_FAILURE (status))) |
Behdad Esfahbod | 03034ac | 2011-05-02 12:37:45 -0400 | [diff] [blame] | 162 | return HB_SCRIPT_UNKNOWN; |
| 163 | |
Behdad Esfahbod | f144a8e | 2011-04-20 02:54:42 -0400 | [diff] [blame] | 164 | return hb_icu_script_to_script (scriptCode); |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 165 | } |
| 166 | |
Behdad Esfahbod | d5045a5 | 2012-08-11 21:26:25 -0400 | [diff] [blame] | 167 | #if U_ICU_VERSION_MAJOR_NUM >= 49 |
| 168 | static const UNormalizer2 *normalizer; |
| 169 | #endif |
| 170 | |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 171 | static hb_bool_t |
| 172 | hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 173 | hb_codepoint_t a, |
| 174 | hb_codepoint_t b, |
| 175 | hb_codepoint_t *ab, |
| 176 | void *user_data HB_UNUSED) |
| 177 | { |
Behdad Esfahbod | d5045a5 | 2012-08-11 21:26:25 -0400 | [diff] [blame] | 178 | #if U_ICU_VERSION_MAJOR_NUM >= 49 |
| 179 | { |
| 180 | UChar32 ret = unorm2_composePair (normalizer, a, b); |
| 181 | if (ret < 0) return false; |
| 182 | *ab = ret; |
| 183 | return true; |
| 184 | } |
| 185 | #endif |
| 186 | |
| 187 | /* We don't ifdef-out the fallback code such that compiler always |
| 188 | * sees it and makes sure it's compilable. */ |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 189 | |
| 190 | UChar utf16[4], normalized[5]; |
Behdad Esfahbod | b1914b8 | 2012-08-07 16:57:48 -0400 | [diff] [blame] | 191 | unsigned int len; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 192 | hb_bool_t ret, err; |
| 193 | UErrorCode icu_err; |
| 194 | |
| 195 | len = 0; |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 196 | err = false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 197 | U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 198 | if (err) return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 199 | U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 200 | if (err) return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 201 | |
| 202 | icu_err = U_ZERO_ERROR; |
| 203 | len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
Behdad Esfahbod | 889caa5 | 2012-01-18 22:32:52 -0500 | [diff] [blame] | 204 | if (U_FAILURE (icu_err)) |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 205 | return false; |
Behdad Esfahbod | 36a4f4a | 2012-01-18 22:16:49 -0500 | [diff] [blame] | 206 | if (u_countChar32 (normalized, len) == 1) { |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 207 | U16_GET_UNSAFE (normalized, 0, *ab); |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 208 | ret = true; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 209 | } else { |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 210 | ret = false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 211 | } |
| 212 | |
| 213 | return ret; |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 214 | } |
| 215 | |
| 216 | static hb_bool_t |
| 217 | hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 218 | hb_codepoint_t ab, |
| 219 | hb_codepoint_t *a, |
| 220 | hb_codepoint_t *b, |
| 221 | void *user_data HB_UNUSED) |
| 222 | { |
Behdad Esfahbod | d5045a5 | 2012-08-11 21:26:25 -0400 | [diff] [blame] | 223 | #if U_ICU_VERSION_MAJOR_NUM >= 49 |
| 224 | { |
| 225 | UChar decomposed[4]; |
| 226 | int len; |
| 227 | UErrorCode icu_err = U_ZERO_ERROR; |
| 228 | len = unorm2_getRawDecomposition (normalizer, ab, decomposed, |
| 229 | ARRAY_LENGTH (decomposed), &icu_err); |
| 230 | if (U_FAILURE (icu_err) || len < 0) return false; |
| 231 | |
| 232 | len = u_countChar32 (decomposed, len); |
| 233 | if (len == 1) { |
| 234 | U16_GET_UNSAFE (decomposed, 0, *a); |
| 235 | *b = 0; |
| 236 | return *a != ab; |
| 237 | } else if (len == 2) { |
| 238 | len =0; |
| 239 | U16_NEXT_UNSAFE (decomposed, len, *a); |
| 240 | U16_NEXT_UNSAFE (decomposed, len, *b); |
| 241 | } |
| 242 | return true; |
| 243 | } |
| 244 | #endif |
| 245 | |
| 246 | /* We don't ifdef-out the fallback code such that compiler always |
| 247 | * sees it and makes sure it's compilable. */ |
| 248 | |
Behdad Esfahbod | 378d279 | 2012-07-31 21:36:16 -0400 | [diff] [blame] | 249 | UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; |
Behdad Esfahbod | b1914b8 | 2012-08-07 16:57:48 -0400 | [diff] [blame] | 250 | unsigned int len; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 251 | hb_bool_t ret, err; |
| 252 | UErrorCode icu_err; |
| 253 | |
Behdad Esfahbod | 63c0ef4 | 2011-07-21 20:58:42 -0400 | [diff] [blame] | 254 | /* This function is a monster! Maybe it wasn't a good idea adding a |
| 255 | * pairwise decompose API... */ |
| 256 | /* Watchout for the dragons. Err, watchout for macros changing len. */ |
| 257 | |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 258 | len = 0; |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 259 | err = false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 260 | U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 261 | if (err) return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 262 | |
| 263 | icu_err = U_ZERO_ERROR; |
| 264 | len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
Behdad Esfahbod | 889caa5 | 2012-01-18 22:32:52 -0500 | [diff] [blame] | 265 | if (U_FAILURE (icu_err)) |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 266 | return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 267 | |
Behdad Esfahbod | 36a4f4a | 2012-01-18 22:16:49 -0500 | [diff] [blame] | 268 | len = u_countChar32 (normalized, len); |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 269 | |
| 270 | if (len == 1) { |
| 271 | U16_GET_UNSAFE (normalized, 0, *a); |
| 272 | *b = 0; |
| 273 | ret = *a != ab; |
| 274 | } else if (len == 2) { |
Behdad Esfahbod | 63c0ef4 | 2011-07-21 20:58:42 -0400 | [diff] [blame] | 275 | len =0; |
| 276 | U16_NEXT_UNSAFE (normalized, len, *a); |
| 277 | U16_NEXT_UNSAFE (normalized, len, *b); |
| 278 | |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 279 | /* Here's the ugly part: if ab decomposes to a single character and |
| 280 | * that character decomposes again, we have to detect that and undo |
| 281 | * the second part :-(. */ |
| 282 | UChar recomposed[20]; |
| 283 | icu_err = U_ZERO_ERROR; |
Behdad Esfahbod | 63c0ef4 | 2011-07-21 20:58:42 -0400 | [diff] [blame] | 284 | unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); |
Behdad Esfahbod | 889caa5 | 2012-01-18 22:32:52 -0500 | [diff] [blame] | 285 | if (U_FAILURE (icu_err)) |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 286 | return false; |
Behdad Esfahbod | 63c0ef4 | 2011-07-21 20:58:42 -0400 | [diff] [blame] | 287 | hb_codepoint_t c; |
| 288 | U16_GET_UNSAFE (recomposed, 0, c); |
| 289 | if (c != *a && c != ab) { |
| 290 | *a = c; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 291 | *b = 0; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 292 | } |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 293 | ret = true; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 294 | } else { |
| 295 | /* If decomposed to more than two characters, take the last one, |
| 296 | * and recompose the rest to get the first component. */ |
Behdad Esfahbod | a18280a | 2012-06-07 15:44:12 -0400 | [diff] [blame] | 297 | U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ |
| 298 | UChar recomposed[18 * 2]; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 299 | icu_err = U_ZERO_ERROR; |
| 300 | len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); |
Behdad Esfahbod | 889caa5 | 2012-01-18 22:32:52 -0500 | [diff] [blame] | 301 | if (U_FAILURE (icu_err)) |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 302 | return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 303 | /* We expect that recomposed has exactly one character now. */ |
Behdad Esfahbod | a18280a | 2012-06-07 15:44:12 -0400 | [diff] [blame] | 304 | if (unlikely (u_countChar32 (recomposed, len) != 1)) |
| 305 | return false; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 306 | U16_GET_UNSAFE (recomposed, 0, *a); |
Behdad Esfahbod | 0594a24 | 2012-06-05 20:35:40 -0400 | [diff] [blame] | 307 | ret = true; |
Behdad Esfahbod | 498e1a9 | 2011-07-20 23:19:49 -0400 | [diff] [blame] | 308 | } |
| 309 | |
| 310 | return ret; |
Behdad Esfahbod | fca0923 | 2011-07-20 22:16:13 -0400 | [diff] [blame] | 311 | } |
| 312 | |
Behdad Esfahbod | 378d279 | 2012-07-31 21:36:16 -0400 | [diff] [blame] | 313 | static unsigned int |
| 314 | hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
| 315 | hb_codepoint_t u, |
| 316 | hb_codepoint_t *decomposed, |
| 317 | void *user_data HB_UNUSED) |
| 318 | { |
| 319 | UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; |
Behdad Esfahbod | b1914b8 | 2012-08-07 16:57:48 -0400 | [diff] [blame] | 320 | unsigned int len; |
Behdad Esfahbod | 378d279 | 2012-07-31 21:36:16 -0400 | [diff] [blame] | 321 | int32_t utf32_len; |
| 322 | hb_bool_t err; |
| 323 | UErrorCode icu_err; |
| 324 | |
| 325 | /* Copy @u into a UTF-16 array to be passed to ICU. */ |
| 326 | len = 0; |
Behdad Esfahbod | 9036484 | 2014-03-24 14:26:36 -0700 | [diff] [blame] | 327 | err = false; |
Behdad Esfahbod | 378d279 | 2012-07-31 21:36:16 -0400 | [diff] [blame] | 328 | U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); |
| 329 | if (err) |
| 330 | return 0; |
| 331 | |
| 332 | /* Normalise the codepoint using NFKD mode. */ |
| 333 | icu_err = U_ZERO_ERROR; |
| 334 | len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
| 335 | if (icu_err) |
| 336 | return 0; |
| 337 | |
| 338 | /* Convert the decomposed form from UTF-16 to UTF-32. */ |
| 339 | icu_err = U_ZERO_ERROR; |
| 340 | u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); |
| 341 | if (icu_err) |
| 342 | return 0; |
| 343 | |
| 344 | return utf32_len; |
| 345 | } |
| 346 | |
Behdad Esfahbod | be4560a | 2012-06-05 18:14:03 -0400 | [diff] [blame] | 347 | |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 348 | hb_unicode_funcs_t * |
| 349 | hb_icu_get_unicode_funcs (void) |
| 350 | { |
Behdad Esfahbod | d5045a5 | 2012-08-11 21:26:25 -0400 | [diff] [blame] | 351 | static const hb_unicode_funcs_t _hb_icu_unicode_funcs = { |
| 352 | HB_OBJECT_HEADER_STATIC, |
| 353 | |
| 354 | NULL, /* parent */ |
| 355 | true, /* immutable */ |
| 356 | { |
| 357 | #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name, |
| 358 | HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS |
| 359 | #undef HB_UNICODE_FUNC_IMPLEMENT |
| 360 | } |
| 361 | }; |
| 362 | |
| 363 | #if U_ICU_VERSION_MAJOR_NUM >= 49 |
| 364 | if (!hb_atomic_ptr_get (&normalizer)) { |
| 365 | UErrorCode icu_err = U_ZERO_ERROR; |
| 366 | /* We ignore failure in getNFCInstace(). */ |
Chris Peterson | d1897a9 | 2015-01-03 19:46:19 -0800 | [diff] [blame] | 367 | (void) hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err)); |
Behdad Esfahbod | d5045a5 | 2012-08-11 21:26:25 -0400 | [diff] [blame] | 368 | } |
| 369 | #endif |
Behdad Esfahbod | f06ab8a | 2012-06-05 12:31:51 -0400 | [diff] [blame] | 370 | return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs); |
Behdad Esfahbod | d94647e | 2009-11-03 16:35:10 -0500 | [diff] [blame] | 371 | } |