third_party/re2/ucs2.diff - mirrors/engine - Git at Google

 This is a dump from Google's source control system of the change
 that removed UCS-2 support from RE2.  As the explanation below
 says, UCS-2 mode is fundamentally at odds with things like ^ and $,
 so it never really worked very well.  But if you are interested in using
 it without those operators, it did work for that.  It assumed that the
 UCS-2 data was in the native host byte order.

 If you are interested in adding UCS-2 mode back, this patch might
 be a good starting point.


 Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

 	Retire UCS-2 mode.

 	I added it as an experiment for V8, but it
 	requires 2-byte lookahead to do completely,
 	and RE2 has 1-byte lookahead (enough for UTF-8)
 	as a fairly deep fundamental assumption,
 	so it did not support ^ or $.

 ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
 re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
       cap_[0] = p;
       if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
         return true;
 -     if (prog_->flags() & Regexp::UCS2)
 -       p++;
     }
     return false;
   }
 ==== re2/compile.cc#17 - re2/compile.cc#18 ====
 re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
   // Input encodings.
   enum Encoding {
     kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
 -   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
     kEncodingLatin1,    // Latin1 (0-FF)
   };

 re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
     void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
     void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
     void Add_80_10ffff();
 -   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
 -   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
 -                    uint8 lo2, uint8 hi2, bool fold2);

     // New suffix that matches the byte range lo-hi, then goes to next.
     Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
 re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477

   // Converts rune range lo-hi into a fragment that recognizes
   // the bytes that would make up those runes in the current
 - // encoding (Latin 1, UTF-8, or UCS-2).
 + // encoding (Latin 1 or UTF-8).
   // This lets the machine work byte-by-byte even when
   // using multibyte encodings.

 re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
       case kEncodingLatin1:
         AddRuneRangeLatin1(lo, hi, foldcase);
         break;
 -     case kEncodingUCS2:
 -       AddRuneRangeUCS2(lo, hi, foldcase);
 -       break;
     }
   }

 re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
     AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
   }

 - // Test whether 16-bit values are big or little endian.
 - static bool BigEndian() {
 -   union {
 -     char byte[2];
 -     int16 endian;
 -   } u;
 -
 -   u.byte[0] = 1;
 -   u.byte[1] = 2;
 -   return u.endian == 0x0102;
 - }
 -
 - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
 -                            uint8 lo2, uint8 hi2, bool fold2) {
 -   Inst* ip;
 -   if (reversed_) {
 -     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
 -     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
 -   } else {
 -     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
 -     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
 -   }
 -   AddSuffix(ip);
 - }
 -
 - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
 -   if (lo > hi || lo > 0xFFFF)
 -     return;
 -   if (hi > 0xFFFF)
 -     hi = 0xFFFF;
 -
 -   // We'll assemble a pattern assuming big endian.
 -   // If the machine isn't, tell Cat to reverse its arguments.
 -   bool oldreversed = reversed_;
 -   if (!BigEndian()) {
 -     reversed_ = !oldreversed;
 -   }
 -
 -   // Split into bytes.
 -   int lo1 = lo >> 8;
 -   int lo2 = lo & 0xFF;
 -   int hi1 = hi >> 8;
 -   int hi2 = hi & 0xFF;
 -
 -   if (lo1 == hi1) {
 -     // Easy case: high bits are same in both.
 -     // Only do ASCII case folding on the second byte if the top byte is 00.
 -     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
 -   } else {
 -     // Harder case: different second byte ranges depending on first byte.
 -
 -     // Initial fragment.
 -     if (lo2 > 0) {
 -       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
 -       lo1++;
 -     }
 -
 -     // Trailing fragment.
 -     if (hi2 < 0xFF) {
 -       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
 -       hi1--;
 -     }
 -
 -     // Inner ranges.
 -     if (lo1 <= hi1) {
 -       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
 -     }
 -   }
 -
 -   // Restore reverse setting.
 -   reversed_ = oldreversed;
 - }
 -
   // Table describing how to make a UTF-8 matching machine
   // for the rune range 80-10FFFF (Runeself-Runemax).
   // This range happens frequently enough (for example /./ and /[^a-z]/)
 re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634

   Frag Compiler::Literal(Rune r, bool foldcase) {
     switch (encoding_) {
 -     default:  // UCS-2 or something new
 -       BeginRange();
 -       AddRuneRange(r, r, foldcase);
 -       return EndRange();
 +     default:
 +       return kNullFrag;

       case kEncodingLatin1:
         return ByteRange(r, r, foldcase);
 re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850

     if (re->parse_flags() & Regexp::Latin1)
       c.encoding_ = kEncodingLatin1;
 -   else if (re->parse_flags() & Regexp::UCS2)
 -     c.encoding_ = kEncodingUCS2;
     c.reversed_ = reversed;
     if (max_mem <= 0) {
       c.max_inst_ = 100000;  // more than enough
 re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
       c.prog_->set_start_unanchored(c.prog_->start());
     } else {
       Frag dot;
 -     if (c.encoding_ == kEncodingUCS2) {
 -       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
 -     } else {
 -       dot = c.ByteRange(0x00, 0xFF, false);
 -     }
 +     dot = c.ByteRange(0x00, 0xFF, false);
       Frag dotloop = c.Star(dot, true);
       Frag unanchored = c.Cat(dotloop, all);
       c.prog_->set_start_unanchored(unanchored.begin);
 ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
 re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
     const char* bp = context.begin();
     int c = -1;
     int wasword = 0;
 -   bool ucs2 = prog_->flags() & Regexp::UCS2;

     if (text.begin() > context.begin()) {
       c = text.begin()[-1] & 0xFF;
 re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
         // If there's a required first byte for an unanchored search
         // and we're not in the middle of any possible matches,
         // use memchr to search for the byte quickly.
 -       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
 +       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
             p < text.end() && (p[0] & 0xFF) != first_byte_) {
           p = reinterpret_cast<const char*>(memchr(p, first_byte_,
                                                    text.end() - p));
 re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
           flag = Prog::EmptyFlags(context, p);
         }

 -       // In UCS-2 mode, if we need to start a new thread,
 -       // make sure to do it on an even boundary.
 -       if(ucs2 && runq->size() == 0 &&
 -           (p - context.begin()) % 2 && p < text.end()) {
 -         p++;
 -         flag = Prog::EmptyFlags(context, p);
 -       }
 -
         // Steal match storage (cleared but unused as of yet)
         // temporarily to hold match boundaries for new thread.
 -       // In UCS-2 mode, only start the thread on a 2-byte boundary.
 -       if(!ucs2 || (p - context.begin()) % 2 == 0) {
 -         match_[0] = p;
 -         AddToThreadq(runq, start_, flag, p, match_);
 -         match_[0] = NULL;
 -       }
 +       match_[0] = p;
 +       AddToThreadq(runq, start_, flag, p, match_);
 +       match_[0] = NULL;
       }

       // If all the threads have died, stop early.
 ==== re2/parse.cc#22 - re2/parse.cc#23 ====
 re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
       status_(status), stacktop_(NULL), ncap_(0) {
     if (flags_ & Latin1)
       rune_max_ = 0xFF;
 -   else if (flags & UCS2)
 -     rune_max_ = 0xFFFF;
     else
       rune_max_ = Runemax;
   }
 re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
   bool Regexp::ParseState::PushCarat() {
     if (flags_ & OneLine) {
       return PushSimpleOp(kRegexpBeginText);
 -   } else {
 -     if (flags_ & UCS2) {
 -       status_->set_code(kRegexpUnsupported);
 -       status_->set_error_arg("multiline ^ in UCS-2 mode");
 -       return false;
 -     }
 -     return PushSimpleOp(kRegexpBeginLine);
     }
 +   return PushSimpleOp(kRegexpBeginLine);
   }

   // Pushes a \b or \B onto the stack.
   bool Regexp::ParseState::PushWordBoundary(bool word) {
 -   if (flags_ & UCS2) {
 -     status_->set_code(kRegexpUnsupported);
 -     status_->set_error_arg("\\b or \\B in UCS-2 mode");
 -     return false;
 -   }
     if (word)
       return PushSimpleOp(kRegexpWordBoundary);
     return PushSimpleOp(kRegexpNoWordBoundary);
 re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
       bool ret = PushSimpleOp(kRegexpEndText);
       flags_ = oflags;
       return ret;
 -   }
 -   if (flags_ & UCS2) {
 -     status_->set_code(kRegexpUnsupported);
 -     status_->set_error_arg("multiline $ in UCS-2 mode");
 -     return false;
     }
     return PushSimpleOp(kRegexpEndLine);
   }
 ==== re2/re2.cc#34 - re2/re2.cc#35 ====
 re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
         return RE2::ErrorBadUTF8;
       case re2::kRegexpBadNamedCapture:
         return RE2::ErrorBadNamedCapture;
 -     case re2::kRegexpUnsupported:
 -       return RE2::ErrorUnsupported;
     }
     return RE2::ErrorInternal;
   }
 re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
         break;
       case RE2::Options::EncodingLatin1:
         flags |= Regexp::Latin1;
 -       break;
 -     case RE2::Options::EncodingUCS2:
 -       flags |= Regexp::UCS2;
         break;
     }

 ==== re2/re2.h#36 - re2/re2.h#37 ====
 re2/re2.h#36:246,252 - re2/re2.h#37:246,251
       ErrorBadUTF8,            // invalid UTF-8 in regexp
       ErrorBadNamedCapture,    // bad named capture group
       ErrorPatternTooLarge,    // pattern too large (compile failed)
 -     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
     };

     // Predefined common options.
 re2/re2.h#36:570,576 - re2/re2.h#37:569,574

       enum Encoding {
         EncodingUTF8 = 1,
 -       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
         EncodingLatin1
       };

 ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
 re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
   // the regexp that remains after the prefix.  The prefix might
   // be ASCII case-insensitive.
   bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
 -   // Don't even bother for UCS-2; it's time to throw that code away.
 -   if (parse_flags_ & UCS2)
 -     return false;
 -
     // No need for a walker: the regexp must be of the form
     // 1. some number of ^ anchors
     // 2. a literal char or string
 ==== re2/regexp.h#20 - re2/regexp.h#21 ====
 re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
     kRegexpBadPerlOp,          // bad perl operator
     kRegexpBadUTF8,            // invalid UTF-8 in regexp
     kRegexpBadNamedCapture,    // bad named capture
 -   kRegexpUnsupported,        // unsupported operator
   };

   // Error status for certain operations.
 re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
                              //   \Q and \E to disable/enable metacharacters
                              //   (?P<name>expr) for named captures
                              //   \C to match any single byte
 -     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
 -     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
 +     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
                              //   and \P{Han} for its negation.
 -     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
 +     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
                              //   it explicitly.

       // As close to Perl as we can get.
 ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
 re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
       cap_[0] = p;
       if (Visit(prog_->start(), p))  // Match must be leftmost; done.
         return true;
 -     if (prog_->flags() & Regexp::UCS2)
 -       p++;
     }
     return false;
   }
 ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
 re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
   static ParseMode parse_modes[] = {
     { single_line,                   "single-line"          },
     { single_line|Regexp::Latin1,    "single-line, latin1"  },
 -   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
     { multi_line,                    "multiline"            },
     { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
     { multi_line|Regexp::Latin1,     "multiline, latin1"    },
 -   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
   };

   static string FormatMode(Regexp::ParseFlags flags) {
 re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
     RegexpStatus status;
     regexp_ = Regexp::Parse(regexp_str, flags, &status);
     if (regexp_ == NULL) {
 -     if (status.code() != kRegexpUnsupported) {
 -       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 -                 << " mode: " << FormatMode(flags);
 -       error_ = true;
 -     }
 +     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 +               << " mode: " << FormatMode(flags);
 +     error_ = true;
       return;
     }
     prog_ = regexp_->CompileToProg(0);
 re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
       RE2::Options options;
       if (flags & Regexp::Latin1)
         options.set_encoding(RE2::Options::EncodingLatin1);
 -     else if (flags & Regexp::UCS2)
 -       options.set_encoding(RE2::Options::EncodingUCS2);
       if (kind_ == Prog::kLongestMatch)
         options.set_longest_match(true);
       re2_ = new RE2(re, options);
 re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
       delete re2_;
   }

 - // Converts UTF-8 string in text into UCS-2 string in new_text.
 - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
 -   const char* p = text.begin();
 -   const char* ep = text.end();
 -   uint16* q = new uint16[ep - p];
 -   uint16* q0 = q;
 -
 -   int n;
 -   Rune r;
 -   for (; p < ep; p += n) {
 -     if (!fullrune(p, ep - p)) {
 -       delete[] q0;
 -       return false;
 -     }
 -     n = chartorune(&r, p);
 -     if (r > 0xFFFF) {
 -       delete[] q0;
 -       return false;
 -     }
 -     *q++ = r;
 -   }
 -   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
 -   return true;
 - }
 -
 - // Rewrites *sp from being a pointer into text8 (UTF-8)
 - // to being a pointer into text16 (equivalent text but in UCS-2).
 - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
 -                               StringPiece *sp) {
 -   if (sp->begin() == NULL && text8.begin() != NULL)
 -     return;
 -
 -   int nrune = 0;
 -   int n;
 -   Rune r;
 -   const char* p = text8.begin();
 -   const char* ep = text8.end();
 -   const char* spbegin = NULL;
 -   const char* spend = NULL;
 -   for (;;) {
 -     if (p == sp->begin())
 -       spbegin = text16.begin() + sizeof(uint16)*nrune;
 -     if (p == sp->end())
 -       spend = text16.begin() + sizeof(uint16)*nrune;
 -     if (p >= ep)
 -       break;
 -     n = chartorune(&r, p);
 -     p += n;
 -     nrune++;
 -   }
 -   if (spbegin == NULL || spend == NULL) {
 -     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
 -                << CEscape(text8) << " "
 -                << (int)(sp->begin() - text8.begin()) << " "
 -                << (int)(sp->end() - text8.begin());
 -   }
 -   *sp = StringPiece(spbegin, spend - spbegin);
 - }
 -
 - // Rewrites *sp from begin a pointer into text16 (UCS-2)
 - // to being a pointer into text8 (equivalent text but in UTF-8).
 - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
 -                               StringPiece* sp) {
 -   if (sp->begin() == NULL)
 -     return;
 -
 -   int nrune = 0;
 -   int n;
 -   Rune r;
 -   const char* p = text8.begin();
 -   const char* ep = text8.end();
 -   const char* spbegin = NULL;
 -   const char* spend = NULL;
 -   for (;;) {
 -     if (nrune == (sp->begin() - text16.begin())/2)
 -       spbegin = p;
 -     if (nrune == (sp->end() - text16.begin())/2)
 -       spend = p;
 -     if (p >= ep)
 -       break;
 -     n = chartorune(&r, p);
 -     p += n;
 -     nrune++;
 -   }
 -   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
 -     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
 -                << CEscape(text16) << " "
 -                << (int)(sp->begin() - text16.begin()) << " "
 -                << (int)(sp->end() - text16.begin());
 -   }
 -   *sp = StringPiece(spbegin, spend - spbegin);
 - }
 -
   // Runs a single search using the named engine type.
   // This interface hides all the irregularities of the various
   // engine interfaces from the rest of this file.
 re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300

     StringPiece text = orig_text;
     StringPiece context = orig_context;
 -   bool ucs2 = false;

 -   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
 -     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
 -       result->skipped = true;
 -       return;
 -     }
 -
 -     // Rewrite context to refer to new text.
 -     AdjustUTF8ToUCS2(orig_context, context, &text);
 -     ucs2 = true;
 -   }
 -
     switch (type) {
       default:
         LOG(FATAL) << "Bad RunSearch type: " << (int)type;
 re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
       }
     }

 -   // If we did UCS-2 matching, rewrite the matches to refer
 -   // to the original UTF-8 text.
 -   if (ucs2) {
 -     if (result->matched) {
 -       if (result->have_submatch0) {
 -         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
 -       } else if (result->have_submatch) {
 -         for (int i = 0; i < nsubmatch; i++) {
 -           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
 -         }
 -       }
 -     }
 -     delete[] context.begin();
 -   }
 -
     if (!result->matched)
       memset(result->submatch, 0, sizeof result->submatch);
   }
 re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
     return true;
   }

 - // Check whether text uses only Unicode points <= 0xFFFF
 - // (in the BMP).
 - static bool IsBMP(const StringPiece& text) {
 -   const char* p = text.begin();
 -   const char* ep = text.end();
 -   while (p < ep) {
 -     if (!fullrune(p, ep - p))
 -       return false;
 -     Rune r;
 -     p += chartorune(&r, p);
 -     if (r > 0xFFFF)
 -       return false;
 -   }
 -   return true;
 - }
 -
   // Runs a single test.
   bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
                              Prog::Anchor anchor) {
 re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
     Result correct;
     RunSearch(kEngineBacktrack, text, context, anchor, &correct);
     if (correct.skipped) {
 -     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
 +     if (regexp_ == NULL)
         return true;
       LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
                  << " " << FormatMode(flags_);
	This is a dump from Google's source control system of the change
	that removed UCS-2 support from RE2. As the explanation below
	says, UCS-2 mode is fundamentally at odds with things like ^ and $,
	so it never really worked very well. But if you are interested in using
	it without those operators, it did work for that. It assumed that the
	UCS-2 data was in the native host byte order.

	If you are interested in adding UCS-2 mode back, this patch might
	be a good starting point.


	Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

	Retire UCS-2 mode.

	I added it as an experiment for V8, but it
	requires 2-byte lookahead to do completely,
	and RE2 has 1-byte lookahead (enough for UTF-8)
	as a fairly deep fundamental assumption,
	so it did not support ^ or $.

	==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
	re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
	cap_[0] = p;
	if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
	return true;
	- if (prog_->flags() & Regexp::UCS2)
	- p++;
	}
	return false;
	}
	==== re2/compile.cc#17 - re2/compile.cc#18 ====
	re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
	// Input encodings.
	enum Encoding {
	kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
	- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
	kEncodingLatin1, // Latin1 (0-FF)
	};

	re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
	void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
	void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
	void Add_80_10ffff();
	- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
	- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
	- uint8 lo2, uint8 hi2, bool fold2);

	// New suffix that matches the byte range lo-hi, then goes to next.
	Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
	re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477

	// Converts rune range lo-hi into a fragment that recognizes
	// the bytes that would make up those runes in the current
	- // encoding (Latin 1, UTF-8, or UCS-2).
	+ // encoding (Latin 1 or UTF-8).
	// This lets the machine work byte-by-byte even when
	// using multibyte encodings.

	re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
	case kEncodingLatin1:
	AddRuneRangeLatin1(lo, hi, foldcase);
	break;
	- case kEncodingUCS2:
	- AddRuneRangeUCS2(lo, hi, foldcase);
	- break;
	}
	}

	re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
	AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
	}

	- // Test whether 16-bit values are big or little endian.
	- static bool BigEndian() {
	- union {
	- char byte[2];
	- int16 endian;
	- } u;
	-
	- u.byte[0] = 1;
	- u.byte[1] = 2;
	- return u.endian == 0x0102;
	- }
	-
	- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
	- uint8 lo2, uint8 hi2, bool fold2) {
	- Inst* ip;
	- if (reversed_) {
	- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
	- ip = RuneByteSuffix(lo2, hi2, fold2, ip);
	- } else {
	- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
	- ip = RuneByteSuffix(lo1, hi1, fold1, ip);
	- }
	- AddSuffix(ip);
	- }
	-
	- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
	- if (lo > hi || lo > 0xFFFF)
	- return;
	- if (hi > 0xFFFF)
	- hi = 0xFFFF;
	-
	- // We'll assemble a pattern assuming big endian.
	- // If the machine isn't, tell Cat to reverse its arguments.
	- bool oldreversed = reversed_;
	- if (!BigEndian()) {
	- reversed_ = !oldreversed;
	- }
	-
	- // Split into bytes.
	- int lo1 = lo >> 8;
	- int lo2 = lo & 0xFF;
	- int hi1 = hi >> 8;
	- int hi2 = hi & 0xFF;
	-
	- if (lo1 == hi1) {
	- // Easy case: high bits are same in both.
	- // Only do ASCII case folding on the second byte if the top byte is 00.
	- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
	- } else {
	- // Harder case: different second byte ranges depending on first byte.
	-
	- // Initial fragment.
	- if (lo2 > 0) {
	- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
	- lo1++;
	- }
	-
	- // Trailing fragment.
	- if (hi2 < 0xFF) {
	- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
	- hi1--;
	- }
	-
	- // Inner ranges.
	- if (lo1 <= hi1) {
	- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
	- }
	- }
	-
	- // Restore reverse setting.
	- reversed_ = oldreversed;
	- }
	-
	// Table describing how to make a UTF-8 matching machine
	// for the rune range 80-10FFFF (Runeself-Runemax).
	// This range happens frequently enough (for example /./ and /[^a-z]/)
	re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634

	Frag Compiler::Literal(Rune r, bool foldcase) {
	switch (encoding_) {
	- default: // UCS-2 or something new
	- BeginRange();
	- AddRuneRange(r, r, foldcase);
	- return EndRange();
	+ default:
	+ return kNullFrag;

	case kEncodingLatin1:
	return ByteRange(r, r, foldcase);
	re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850

	if (re->parse_flags() & Regexp::Latin1)
	c.encoding_ = kEncodingLatin1;
	- else if (re->parse_flags() & Regexp::UCS2)
	- c.encoding_ = kEncodingUCS2;
	c.reversed_ = reversed;
	if (max_mem <= 0) {
	c.max_inst_ = 100000; // more than enough
	re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
	c.prog_->set_start_unanchored(c.prog_->start());
	} else {
	Frag dot;
	- if (c.encoding_ == kEncodingUCS2) {
	- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
	- } else {
	- dot = c.ByteRange(0x00, 0xFF, false);
	- }
	+ dot = c.ByteRange(0x00, 0xFF, false);
	Frag dotloop = c.Star(dot, true);
	Frag unanchored = c.Cat(dotloop, all);
	c.prog_->set_start_unanchored(unanchored.begin);
	==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
	re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
	const char* bp = context.begin();
	int c = -1;
	int wasword = 0;
	- bool ucs2 = prog_->flags() & Regexp::UCS2;

	if (text.begin() > context.begin()) {
	c = text.begin()[-1] & 0xFF;
	re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
	// If there's a required first byte for an unanchored search
	// and we're not in the middle of any possible matches,
	// use memchr to search for the byte quickly.
	- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
	+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
	p < text.end() && (p[0] & 0xFF) != first_byte_) {
	p = reinterpret_cast<const char*>(memchr(p, first_byte_,
	text.end() - p));
	re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
	flag = Prog::EmptyFlags(context, p);
	}

	- // In UCS-2 mode, if we need to start a new thread,
	- // make sure to do it on an even boundary.
	- if(ucs2 && runq->size() == 0 &&
	- (p - context.begin()) % 2 && p < text.end()) {
	- p++;
	- flag = Prog::EmptyFlags(context, p);
	- }
	-
	// Steal match storage (cleared but unused as of yet)
	// temporarily to hold match boundaries for new thread.
	- // In UCS-2 mode, only start the thread on a 2-byte boundary.
	- if(!ucs2 || (p - context.begin()) % 2 == 0) {
	- match_[0] = p;
	- AddToThreadq(runq, start_, flag, p, match_);
	- match_[0] = NULL;
	- }
	+ match_[0] = p;
	+ AddToThreadq(runq, start_, flag, p, match_);
	+ match_[0] = NULL;
	}

	// If all the threads have died, stop early.
	==== re2/parse.cc#22 - re2/parse.cc#23 ====
	re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
	status_(status), stacktop_(NULL), ncap_(0) {
	if (flags_ & Latin1)
	rune_max_ = 0xFF;
	- else if (flags & UCS2)
	- rune_max_ = 0xFFFF;
	else
	rune_max_ = Runemax;
	}
	re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
	bool Regexp::ParseState::PushCarat() {
	if (flags_ & OneLine) {
	return PushSimpleOp(kRegexpBeginText);
	- } else {
	- if (flags_ & UCS2) {
	- status_->set_code(kRegexpUnsupported);
	- status_->set_error_arg("multiline ^ in UCS-2 mode");
	- return false;
	- }
	- return PushSimpleOp(kRegexpBeginLine);
	}
	+ return PushSimpleOp(kRegexpBeginLine);
	}

	// Pushes a \b or \B onto the stack.
	bool Regexp::ParseState::PushWordBoundary(bool word) {
	- if (flags_ & UCS2) {
	- status_->set_code(kRegexpUnsupported);
	- status_->set_error_arg("\\b or \\B in UCS-2 mode");
	- return false;
	- }
	if (word)
	return PushSimpleOp(kRegexpWordBoundary);
	return PushSimpleOp(kRegexpNoWordBoundary);
	re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
	bool ret = PushSimpleOp(kRegexpEndText);
	flags_ = oflags;
	return ret;
	- }
	- if (flags_ & UCS2) {
	- status_->set_code(kRegexpUnsupported);
	- status_->set_error_arg("multiline $ in UCS-2 mode");
	- return false;
	}
	return PushSimpleOp(kRegexpEndLine);
	}
	==== re2/re2.cc#34 - re2/re2.cc#35 ====
	re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
	return RE2::ErrorBadUTF8;
	case re2::kRegexpBadNamedCapture:
	return RE2::ErrorBadNamedCapture;
	- case re2::kRegexpUnsupported:
	- return RE2::ErrorUnsupported;
	}
	return RE2::ErrorInternal;
	}
	re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
	break;
	case RE2::Options::EncodingLatin1:
	flags |= Regexp::Latin1;
	- break;
	- case RE2::Options::EncodingUCS2:
	- flags |= Regexp::UCS2;
	break;
	}

	==== re2/re2.h#36 - re2/re2.h#37 ====
	re2/re2.h#36:246,252 - re2/re2.h#37:246,251
	ErrorBadUTF8, // invalid UTF-8 in regexp
	ErrorBadNamedCapture, // bad named capture group
	ErrorPatternTooLarge, // pattern too large (compile failed)
	- ErrorUnsupported, // unsupported feature (in UCS-2 mode)
	};

	// Predefined common options.
	re2/re2.h#36:570,576 - re2/re2.h#37:569,574

	enum Encoding {
	EncodingUTF8 = 1,
	- EncodingUCS2, // 16-bit Unicode 0-FFFF only
	EncodingLatin1
	};

	==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
	re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
	// the regexp that remains after the prefix. The prefix might
	// be ASCII case-insensitive.
	bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
	- // Don't even bother for UCS-2; it's time to throw that code away.
	- if (parse_flags_ & UCS2)
	- return false;
	-
	// No need for a walker: the regexp must be of the form
	// 1. some number of ^ anchors
	// 2. a literal char or string
	==== re2/regexp.h#20 - re2/regexp.h#21 ====
	re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
	kRegexpBadPerlOp, // bad perl operator
	kRegexpBadUTF8, // invalid UTF-8 in regexp
	kRegexpBadNamedCapture, // bad named capture
	- kRegexpUnsupported, // unsupported operator
	};

	// Error status for certain operations.
	re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
	// \Q and \E to disable/enable metacharacters
	// (?P<name>expr) for named captures
	// \C to match any single byte
	- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
	- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
	+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
	// and \P{Han} for its negation.
	- NeverNL = 1<<12, // Never match NL, even if the regexp mentions
	+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
	// it explicitly.

	// As close to Perl as we can get.
	==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
	re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
	cap_[0] = p;
	if (Visit(prog_->start(), p)) // Match must be leftmost; done.
	return true;
	- if (prog_->flags() & Regexp::UCS2)
	- p++;
	}
	return false;
	}
	==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
	re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
	static ParseMode parse_modes[] = {
	{ single_line, "single-line" },
	{ single_line|Regexp::Latin1, "single-line, latin1" },
	- { single_line|Regexp::UCS2, "single-line, ucs2" },
	{ multi_line, "multiline" },
	{ multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
	{ multi_line|Regexp::Latin1, "multiline, latin1" },
	- { multi_line|Regexp::UCS2, "multiline, ucs2" },
	};

	static string FormatMode(Regexp::ParseFlags flags) {
	re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
	RegexpStatus status;
	regexp_ = Regexp::Parse(regexp_str, flags, &status);
	if (regexp_ == NULL) {
	- if (status.code() != kRegexpUnsupported) {
	- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
	- << " mode: " << FormatMode(flags);
	- error_ = true;
	- }
	+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
	+ << " mode: " << FormatMode(flags);
	+ error_ = true;
	return;
	}
	prog_ = regexp_->CompileToProg(0);
	re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
	RE2::Options options;
	if (flags & Regexp::Latin1)
	options.set_encoding(RE2::Options::EncodingLatin1);
	- else if (flags & Regexp::UCS2)
	- options.set_encoding(RE2::Options::EncodingUCS2);
	if (kind_ == Prog::kLongestMatch)
	options.set_longest_match(true);
	re2_ = new RE2(re, options);
	re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
	delete re2_;
	}

	- // Converts UTF-8 string in text into UCS-2 string in new_text.
	- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
	- const char* p = text.begin();
	- const char* ep = text.end();
	- uint16* q = new uint16[ep - p];
	- uint16* q0 = q;
	-
	- int n;
	- Rune r;
	- for (; p < ep; p += n) {
	- if (!fullrune(p, ep - p)) {
	- delete[] q0;
	- return false;
	- }
	- n = chartorune(&r, p);
	- if (r > 0xFFFF) {
	- delete[] q0;
	- return false;
	- }
	- *q++ = r;
	- }
	- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
	- return true;
	- }
	-
	- // Rewrites *sp from being a pointer into text8 (UTF-8)
	- // to being a pointer into text16 (equivalent text but in UCS-2).
	- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
	- StringPiece *sp) {
	- if (sp->begin() == NULL && text8.begin() != NULL)
	- return;
	-
	- int nrune = 0;
	- int n;
	- Rune r;
	- const char* p = text8.begin();
	- const char* ep = text8.end();
	- const char* spbegin = NULL;
	- const char* spend = NULL;
	- for (;;) {
	- if (p == sp->begin())
	- spbegin = text16.begin() + sizeof(uint16)*nrune;
	- if (p == sp->end())
	- spend = text16.begin() + sizeof(uint16)*nrune;
	- if (p >= ep)
	- break;
	- n = chartorune(&r, p);
	- p += n;
	- nrune++;
	- }
	- if (spbegin == NULL || spend == NULL) {
	- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
	- << CEscape(text8) << " "
	- << (int)(sp->begin() - text8.begin()) << " "
	- << (int)(sp->end() - text8.begin());
	- }
	- *sp = StringPiece(spbegin, spend - spbegin);
	- }
	-
	- // Rewrites *sp from begin a pointer into text16 (UCS-2)
	- // to being a pointer into text8 (equivalent text but in UTF-8).
	- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
	- StringPiece* sp) {
	- if (sp->begin() == NULL)
	- return;
	-
	- int nrune = 0;
	- int n;
	- Rune r;
	- const char* p = text8.begin();
	- const char* ep = text8.end();
	- const char* spbegin = NULL;
	- const char* spend = NULL;
	- for (;;) {
	- if (nrune == (sp->begin() - text16.begin())/2)
	- spbegin = p;
	- if (nrune == (sp->end() - text16.begin())/2)
	- spend = p;
	- if (p >= ep)
	- break;
	- n = chartorune(&r, p);
	- p += n;
	- nrune++;
	- }
	- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
	- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
	- << CEscape(text16) << " "
	- << (int)(sp->begin() - text16.begin()) << " "
	- << (int)(sp->end() - text16.begin());
	- }
	- *sp = StringPiece(spbegin, spend - spbegin);
	- }
	-
	// Runs a single search using the named engine type.
	// This interface hides all the irregularities of the various
	// engine interfaces from the rest of this file.
	re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300

	StringPiece text = orig_text;
	StringPiece context = orig_context;
	- bool ucs2 = false;

	- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
	- if (!ConvertUTF8ToUCS2(orig_context, &context)) {
	- result->skipped = true;
	- return;
	- }
	-
	- // Rewrite context to refer to new text.
	- AdjustUTF8ToUCS2(orig_context, context, &text);
	- ucs2 = true;
	- }
	-
	switch (type) {
	default:
	LOG(FATAL) << "Bad RunSearch type: " << (int)type;
	re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
	}
	}

	- // If we did UCS-2 matching, rewrite the matches to refer
	- // to the original UTF-8 text.
	- if (ucs2) {
	- if (result->matched) {
	- if (result->have_submatch0) {
	- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
	- } else if (result->have_submatch) {
	- for (int i = 0; i < nsubmatch; i++) {
	- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
	- }
	- }
	- }
	- delete[] context.begin();
	- }
	-
	if (!result->matched)
	memset(result->submatch, 0, sizeof result->submatch);
	}
	re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
	return true;
	}

	- // Check whether text uses only Unicode points <= 0xFFFF
	- // (in the BMP).
	- static bool IsBMP(const StringPiece& text) {
	- const char* p = text.begin();
	- const char* ep = text.end();
	- while (p < ep) {
	- if (!fullrune(p, ep - p))
	- return false;
	- Rune r;
	- p += chartorune(&r, p);
	- if (r > 0xFFFF)
	- return false;
	- }
	- return true;
	- }
	-
	// Runs a single test.
	bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
	Prog::Anchor anchor) {
	re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
	Result correct;
	RunSearch(kEngineBacktrack, text, context, anchor, &correct);
	if (correct.skipped) {
	- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
	+ if (regexp_ == NULL)
	return true;
	LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
	<< " " << FormatMode(flags_);