// third_party/txt/src/minikin/Hyphenator.h - mirrors/engine - Git at Google

 /*
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * An implementation of Liang's hyphenation algorithm.
  */

 #ifndef U_USING_ICU_NAMESPACE
 #define U_USING_ICU_NAMESPACE 0
 #endif  //  U_USING_ICU_NAMESPACE

 #include <memory>
 #include <unordered_map>
 #include <vector>
 #include "unicode/locid.h"

 #ifndef MINIKIN_HYPHENATOR_H
 #define MINIKIN_HYPHENATOR_H

 namespace minikin {

 // The action to take at one potential hyphenation point of a word.
 //
 // Note: code elsewhere implicitly assumes that DONT_BREAK is 0.
 enum class HyphenationType : uint8_t {
   // No line break is allowed here.
   DONT_BREAK = 0,

   // Break the line and add a normal hyphen.
   BREAK_AND_INSERT_HYPHEN = 1,

   // Break the line and add an Armenian hyphen (U+058A).
   BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

   // Break the line and add a maqaf, the Hebrew hyphen (U+05BE).
   BREAK_AND_INSERT_MAQAF = 3,

   // Break the line and add a Canadian Syllabics hyphen (U+1400).
   BREAK_AND_INSERT_UCAS_HYPHEN = 4,

   // Break the line without adding a hyphen. Used when a hyphen is already
   // present or when the script does not use a hyphen (e.g. Malayalam).
   BREAK_AND_DONT_INSERT_HYPHEN = 5,

   // Break the line and replace the last code unit with a hyphen. Used for
   // Catalan "l·l", which hyphenates as "l-/l".
   BREAK_AND_REPLACE_WITH_HYPHEN = 6,

   // Break the line and repeat the hyphen (which is the last character) at
   // the beginning of the next line. Used in Polish, where
   // "czerwono-niebieska" should hyphenate as "czerwono-/-niebieska".
   BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

   // Break the line, put a ZWJ and a hyphen at the end of the first line and
   // a ZWJ at the start of the second line. Used in Arabic script, mostly for
   // writing systems of Central Asia. This is the default behavior when a
   // soft hyphen is used in Arabic script.
   BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
 };

 // A HyphenEdit describes how the text is edited when a word is hyphenated.
 // The most common edit adds a "-" at the end of a syllable, but nonstandard
 // hyphenation allows for more choices. A single HyphenEdit can hold two
 // edits at the same time: one for the beginning of the string/line and one
 // for the end.
 class HyphenEdit {
  public:
   static const uint32_t NO_EDIT = 0x00;

   // Edits applied at the end of a line; all of them fit in MASK_END_OF_LINE.
   static const uint32_t INSERT_HYPHEN_AT_END = 0x01;
   static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
   static const uint32_t INSERT_MAQAF_AT_END = 0x03;
   static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
   static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
   static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
   static const uint32_t BREAK_AT_END = 0x07;

   // Edits applied at the start of a line; all of them fit in
   // MASK_START_OF_LINE.
   static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
   static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
   static const uint32_t BREAK_AT_START = 0x03 << 3;

   // Keep in sync with the definitions in the Java code at:
   // frameworks/base/graphics/java/android/graphics/Paint.java
   static const uint32_t MASK_END_OF_LINE = 0x07;
   static const uint32_t MASK_START_OF_LINE = 0x03 << 3;

   // Returns true only when hyph is exactly REPLACE_WITH_HYPHEN_AT_END.
   inline static bool isReplacement(uint32_t hyph) {
     return REPLACE_WITH_HYPHEN_AT_END == hyph;
   }

   // Returns true only when hyph is exactly one of the INSERT_* edits. The
   // comparison is by equality, so a value that combines a start-of-line edit
   // with an end-of-line edit matches none of them.
   // NOTE(review): callers presumably pass getEnd() or getStart() -- confirm.
   inline static bool isInsertion(uint32_t hyph) {
     switch (hyph) {
       case INSERT_HYPHEN_AT_END:
       case INSERT_ARMENIAN_HYPHEN_AT_END:
       case INSERT_MAQAF_AT_END:
       case INSERT_UCAS_HYPHEN_AT_END:
       case INSERT_ZWJ_AND_HYPHEN_AT_END:
       case INSERT_HYPHEN_AT_START:
       case INSERT_ZWJ_AT_START:
         return true;
       default:
         return false;
     }
   }

   // Declared here only; the definitions are not part of this header.
   static const uint32_t* getHyphenString(uint32_t hyph);
   static uint32_t editForThisLine(HyphenationType type);
   static uint32_t editForNextLine(HyphenationType type);

   // A default-constructed edit holds NO_EDIT.
   HyphenEdit() : HyphenEdit(NO_EDIT) {}
   // Implicit conversion from the raw edit bits is intentional.
   HyphenEdit(uint32_t hyphenInt)  // NOLINT(google-explicit-constructor)
       : hyphen(hyphenInt) {}

   // Returns the raw edit bits (start and end parts together).
   uint32_t getHyphen() const { return hyphen; }

   // Two edits are equal when their raw bits are equal.
   bool operator==(const HyphenEdit& other) const {
     return getHyphen() == other.getHyphen();
   }

   // Returns only the end-of-line bits of the edit.
   uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
   // Returns only the start-of-line bits of the edit (still shifted).
   uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }

  private:
   uint32_t hyphen;
 };

 // Header of a hyb file. It is only forward-declared here; the implementation
 // details are in the .cpp file.
 struct Header;

 // Computes the hyphenation of words, optionally backed by binary pattern
 // data.
 class Hyphenator {
  public:
   // Computes the hyphenation of `word` (`len` code units) and stores it in
   // the `result` vector. Each entry of the vector is the "hyphenation type"
   // of a potential hyphenation at the corresponding code unit offset in the
   // word.
   //
   // Example: for the word "hyphen" the result corresponds to "hy-phen":
   //   [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK,
   //    DONT_BREAK, DONT_BREAK]
   void hyphenate(std::vector<HyphenationType>* result,
                  const uint16_t* word, size_t len,
                  const icu::Locale& locale);

   // Returns true if the codepoint behaves like U+2010 HYPHEN in line
   // breaking and usage: a line break is allowed immediately after it, but
   // words containing it should not be automatically hyphenated.
   static bool isLineBreakingHyphen(uint32_t cp);

   // Returns a Hyphenator for `patternData`, which is in the binary format
   // described in doc/hyb_file_format.md. The caller is responsible for
   // keeping the pattern data alive at least as long as the Hyphenator
   // object.
   //
   // nullptr is valid input; the hyphenator then only processes soft hyphens.
   //
   // NOTE(review): ownership of the returned raw pointer is not stated in
   // this header -- confirm against the callers.
   static Hyphenator* loadBinary(const uint8_t* patternData,
                                 size_t minPrefix, size_t minSuffix);

  private:
   // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. This constant
   // serves a slightly different purpose: it lets temporary buffers be
   // stack-allocated without waste. It is measured in UTF-16 code units.
   static const size_t MAX_HYPHENATED_SIZE = 64;

   const uint8_t* patternData;
   size_t minPrefix;
   size_t minSuffix;

   // Applies the various hyphenation rules, including hard and soft hyphens,
   // while ignoring patterns.
   void hyphenateWithNoPatterns(HyphenationType* result,
                                const uint16_t* word, size_t len,
                                const icu::Locale& locale);

   // Tries to look up the word in the alphabet table. Returns DONT_BREAK if
   // any code unit fails to map; otherwise returns BREAK_AND_INSERT_HYPHEN,
   // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN,
   // depending on the script of the characters seen. This method writes
   // len+2 entries into alpha_codes (including start and stop).
   HyphenationType alphabetLookup(uint16_t* alpha_codes,
                                  const uint16_t* word, size_t len);

   // Calculates the hyphenation from patterns. The alphabet lookup must
   // already have been done.
   void hyphenateFromCodes(HyphenationType* result,
                           const uint16_t* codes, size_t len,
                           HyphenationType hyphenValue);

   // Accessor for the binary data: views the start of patternData as the hyb
   // header.
   const Header* getHeader() const {
     const Header* header = reinterpret_cast<const Header*>(patternData);
     return header;
   }
 };

 }  // namespace minikin

 #endif  // MINIKIN_HYPHENATOR_H
	/*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* An implementation of Liang's hyphenation algorithm.
	*/

	#ifndef U_USING_ICU_NAMESPACE
	#define U_USING_ICU_NAMESPACE 0
	#endif // U_USING_ICU_NAMESPACE

	#include <memory>
	#include <unordered_map>
	#include <vector>
	#include "unicode/locid.h"

	#ifndef MINIKIN_HYPHENATOR_H
	#define MINIKIN_HYPHENATOR_H

	namespace minikin {

	// The action to take at one potential hyphenation point of a word.
	//
	// Note: code elsewhere implicitly assumes that DONT_BREAK is 0.
	enum class HyphenationType : uint8_t {
	  // No line break is allowed here.
	  DONT_BREAK = 0,

	  // Break the line and add a normal hyphen.
	  BREAK_AND_INSERT_HYPHEN = 1,

	  // Break the line and add an Armenian hyphen (U+058A).
	  BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

	  // Break the line and add a maqaf, the Hebrew hyphen (U+05BE).
	  BREAK_AND_INSERT_MAQAF = 3,

	  // Break the line and add a Canadian Syllabics hyphen (U+1400).
	  BREAK_AND_INSERT_UCAS_HYPHEN = 4,

	  // Break the line without adding a hyphen. Used when a hyphen is already
	  // present or when the script does not use a hyphen (e.g. Malayalam).
	  BREAK_AND_DONT_INSERT_HYPHEN = 5,

	  // Break the line and replace the last code unit with a hyphen. Used for
	  // Catalan "l·l", which hyphenates as "l-/l".
	  BREAK_AND_REPLACE_WITH_HYPHEN = 6,

	  // Break the line and repeat the hyphen (which is the last character) at
	  // the beginning of the next line. Used in Polish, where
	  // "czerwono-niebieska" should hyphenate as "czerwono-/-niebieska".
	  BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

	  // Break the line, put a ZWJ and a hyphen at the end of the first line and
	  // a ZWJ at the start of the second line. Used in Arabic script, mostly for
	  // writing systems of Central Asia. This is the default behavior when a
	  // soft hyphen is used in Arabic script.
	  BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
	};

	// The hyphen edit represents an edit to the string when a word is
	// hyphenated. The most common hyphen edit is adding a "-" at the end
	// of a syllable, but nonstandard hyphenation allows for more choices.
	// Note that a HyphenEdit can hold two types of edits at the same time:
	// one at the beginning of the string/line and one at the end.
	class HyphenEdit {
	 public:
	  static const uint32_t NO_EDIT = 0x00;

	  // Edits applied at the end of a line; all of them fit in MASK_END_OF_LINE.
	  static const uint32_t INSERT_HYPHEN_AT_END = 0x01;
	  static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
	  static const uint32_t INSERT_MAQAF_AT_END = 0x03;
	  static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
	  static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
	  static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
	  static const uint32_t BREAK_AT_END = 0x07;

	  // Edits applied at the start of a line; all of them fit in
	  // MASK_START_OF_LINE.
	  static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
	  static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
	  static const uint32_t BREAK_AT_START = 0x03 << 3;

	  // Keep in sync with the definitions in the Java code at:
	  // frameworks/base/graphics/java/android/graphics/Paint.java
	  static const uint32_t MASK_END_OF_LINE = 0x07;
	  static const uint32_t MASK_START_OF_LINE = 0x03 << 3;

	  // Returns true only when hyph is exactly REPLACE_WITH_HYPHEN_AT_END.
	  inline static bool isReplacement(uint32_t hyph) {
	    return hyph == REPLACE_WITH_HYPHEN_AT_END;
	  }

	  // Returns true only when hyph is exactly one of the INSERT_* edits. The
	  // comparison is by equality, so a value that combines a start-of-line edit
	  // with an end-of-line edit matches none of them.
	  inline static bool isInsertion(uint32_t hyph) {
	    return (hyph == INSERT_HYPHEN_AT_END ||
	            hyph == INSERT_ARMENIAN_HYPHEN_AT_END ||
	            hyph == INSERT_MAQAF_AT_END || hyph == INSERT_UCAS_HYPHEN_AT_END ||
	            hyph == INSERT_ZWJ_AND_HYPHEN_AT_END ||
	            hyph == INSERT_HYPHEN_AT_START || hyph == INSERT_ZWJ_AT_START);
	  }

	  // Declared here only; the definitions are not part of this header.
	  const static uint32_t* getHyphenString(uint32_t hyph);
	  static uint32_t editForThisLine(HyphenationType type);
	  static uint32_t editForNextLine(HyphenationType type);

	  // A default-constructed edit holds NO_EDIT.
	  HyphenEdit() : hyphen(NO_EDIT) {}
	  // Implicit conversion from the raw edit bits is intentional.
	  HyphenEdit(uint32_t hyphenInt)  // NOLINT(google-explicit-constructor)
	      : hyphen(hyphenInt) {}

	  // Returns the raw edit bits (start and end parts together).
	  uint32_t getHyphen() const { return hyphen; }

	  // Two edits are equal when their raw bits are equal.
	  bool operator==(const HyphenEdit& other) const {
	    return hyphen == other.hyphen;
	  }

	  // Returns only the end-of-line bits of the edit.
	  uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
	  // Returns only the start-of-line bits of the edit (still shifted).
	  uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }

	 private:
	  uint32_t hyphen;
	};

	// Header of a hyb file. It is only forward-declared here; the implementation
	// details are in the .cpp file.
	struct Header;

	// Computes the hyphenation of words, optionally backed by binary pattern
	// data.
	class Hyphenator {
	 public:
	  // Computes the hyphenation of `word` (`len` code units) and stores it in
	  // the `result` vector. Each entry of the vector is the "hyphenation type"
	  // of a potential hyphenation at the corresponding code unit offset in the
	  // word.
	  //
	  // Example: for the word "hyphen" the result corresponds to "hy-phen":
	  //   [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK,
	  //    DONT_BREAK, DONT_BREAK]
	  void hyphenate(std::vector<HyphenationType>* result,
	                 const uint16_t* word, size_t len,
	                 const icu::Locale& locale);

	  // Returns true if the codepoint behaves like U+2010 HYPHEN in line
	  // breaking and usage: a line break is allowed immediately after it, but
	  // words containing it should not be automatically hyphenated.
	  static bool isLineBreakingHyphen(uint32_t cp);

	  // Returns a Hyphenator for `patternData`, which is in the binary format
	  // described in doc/hyb_file_format.md. The caller is responsible for
	  // keeping the pattern data alive at least as long as the Hyphenator
	  // object.
	  //
	  // nullptr is valid input; the hyphenator then only processes soft hyphens.
	  //
	  // NOTE(review): ownership of the returned raw pointer is not stated in
	  // this header -- confirm against the callers.
	  static Hyphenator* loadBinary(const uint8_t* patternData,
	                                size_t minPrefix, size_t minSuffix);

	 private:
	  // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. This constant
	  // serves a slightly different purpose: it lets temporary buffers be
	  // stack-allocated without waste. It is measured in UTF-16 code units.
	  static const size_t MAX_HYPHENATED_SIZE = 64;

	  const uint8_t* patternData;
	  size_t minPrefix;
	  size_t minSuffix;

	  // Applies the various hyphenation rules, including hard and soft hyphens,
	  // while ignoring patterns.
	  void hyphenateWithNoPatterns(HyphenationType* result,
	                               const uint16_t* word, size_t len,
	                               const icu::Locale& locale);

	  // Tries to look up the word in the alphabet table. Returns DONT_BREAK if
	  // any code unit fails to map; otherwise returns BREAK_AND_INSERT_HYPHEN,
	  // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN,
	  // depending on the script of the characters seen. This method writes
	  // len+2 entries into alpha_codes (including start and stop).
	  HyphenationType alphabetLookup(uint16_t* alpha_codes,
	                                 const uint16_t* word, size_t len);

	  // Calculates the hyphenation from patterns. The alphabet lookup must
	  // already have been done.
	  void hyphenateFromCodes(HyphenationType* result,
	                          const uint16_t* codes, size_t len,
	                          HyphenationType hyphenValue);

	  // Accessor for the binary data: views the start of patternData as the hyb
	  // header.
	  const Header* getHeader() const {
	    const Header* header = reinterpret_cast<const Header*>(patternData);
	    return header;
	  }
	};

	} // namespace minikin

	#endif // MINIKIN_HYPHENATOR_H