lib/yaml/scanner.py - third_party/rapidjson - Git at Google


 # Scanner produces tokens of the following types:
 # STREAM-START
 # STREAM-END
 # DIRECTIVE(name, value)
 # DOCUMENT-START
 # DOCUMENT-END
 # BLOCK-SEQUENCE-START
 # BLOCK-MAPPING-START
 # BLOCK-END
 # FLOW-SEQUENCE-START
 # FLOW-MAPPING-START
 # FLOW-SEQUENCE-END
 # FLOW-MAPPING-END
 # BLOCK-ENTRY
 # FLOW-ENTRY
 # KEY
 # VALUE
 # ALIAS(value)
 # ANCHOR(value)
 # TAG(value)
 # SCALAR(value, plain, style)
 #
 # Read comments in the Scanner code for more details.
 #

 __all__ = ['Scanner', 'ScannerError']

 from error import MarkedYAMLError
 from tokens import *

 class ScannerError(MarkedYAMLError):
     pass

 class SimpleKey(object):
     # See below simple keys treatment.

     def __init__(self, token_number, required, index, line, column, mark):
         self.token_number = token_number
         self.required = required
         self.index = index
         self.line = line
         self.column = column
         self.mark = mark

 class Scanner(object):

     def __init__(self):
         """Initialize the scanner."""
         # It is assumed that Scanner and Reader will have a common descendant.
         # Reader do the dirty work of checking for BOM and converting the
         # input data to Unicode. It also adds NUL to the end.
         #
         # Reader supports the following methods
         #   self.peek(i=0)       # peek the next i-th character
         #   self.prefix(l=1)     # peek the next l characters
         #   self.forward(l=1)    # read the next l characters and move the pointer.

         # Had we reached the end of the stream?
         self.done = False

         # The number of unclosed '{' and '['. `flow_level == 0` means block
         # context.
         self.flow_level = 0

         # List of processed tokens that are not yet emitted.
         self.tokens = []

         # Add the STREAM-START token.
         self.fetch_stream_start()

         # Number of tokens that were emitted through the `get_token` method.
         self.tokens_taken = 0

         # The current indentation level.
         self.indent = -1

         # Past indentation levels.
         self.indents = []

         # Variables related to simple keys treatment.

         # A simple key is a key that is not denoted by the '?' indicator.
         # Example of simple keys:
         #   ---
         #   block simple key: value
         #   ? not a simple key:
         #   : { flow simple key: value }
         # We emit the KEY token before all keys, so when we find a potential
         # simple key, we try to locate the corresponding ':' indicator.
         # Simple keys should be limited to a single line and 1024 characters.

         # Can a simple key start at the current position? A simple key may
         # start:
         # - at the beginning of the line, not counting indentation spaces
         #       (in block context),
         # - after '{', '[', ',' (in the flow context),
         # - after '?', ':', '-' (in the block context).
         # In the block context, this flag also signifies if a block collection
         # may start at the current position.
         self.allow_simple_key = True

         # Keep track of possible simple keys. This is a dictionary. The key
         # is `flow_level`; there can be no more that one possible simple key
         # for each level. The value is a SimpleKey record:
         #   (token_number, required, index, line, column, mark)
         # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
         # '[', or '{' tokens.
         self.possible_simple_keys = {}

     # Public methods.

     def check_token(self, *choices):
         # Check if the next token is one of the given types.
         while self.need_more_tokens():
             self.fetch_more_tokens()
         if self.tokens:
             if not choices:
                 return True
             for choice in choices:
                 if isinstance(self.tokens[0], choice):
                     return True
         return False

     def peek_token(self):
         # Return the next token, but do not delete if from the queue.
         # Return None if no more tokens.
         while self.need_more_tokens():
             self.fetch_more_tokens()
         if self.tokens:
             return self.tokens[0]
         else:
             return None

     def get_token(self):
         # Return the next token.
         while self.need_more_tokens():
             self.fetch_more_tokens()
         if self.tokens:
             self.tokens_taken += 1
             return self.tokens.pop(0)

     # Private methods.

     def need_more_tokens(self):
         if self.done:
             return False
         if not self.tokens:
             return True
         # The current token may be a potential simple key, so we
         # need to look further.
         self.stale_possible_simple_keys()
         if self.next_possible_simple_key() == self.tokens_taken:
             return True

     def fetch_more_tokens(self):

         # Eat whitespaces and comments until we reach the next token.
         self.scan_to_next_token()

         # Remove obsolete possible simple keys.
         self.stale_possible_simple_keys()

         # Compare the current indentation and column. It may add some tokens
         # and decrease the current indentation level.
         self.unwind_indent(self.column)

         # Peek the next character.
         ch = self.peek()

         # Is it the end of stream?
         if ch == u'\0':
             return self.fetch_stream_end()

         # Is it a directive?
         if ch == u'%' and self.check_directive():
             return self.fetch_directive()

         # Is it the document start?
         if ch == u'-' and self.check_document_start():
             return self.fetch_document_start()

         # Is it the document end?
         if ch == u'.' and self.check_document_end():
             return self.fetch_document_end()

         # TODO: support for BOM within a stream.
         #if ch == u'\uFEFF':
         #    return self.fetch_bom()    <-- issue BOMToken

         # Note: the order of the following checks is NOT significant.

         # Is it the flow sequence start indicator?
         if ch == u'[':
             return self.fetch_flow_sequence_start()

         # Is it the flow mapping start indicator?
         if ch == u'{':
             return self.fetch_flow_mapping_start()

         # Is it the flow sequence end indicator?
         if ch == u']':
             return self.fetch_flow_sequence_end()

         # Is it the flow mapping end indicator?
         if ch == u'}':
             return self.fetch_flow_mapping_end()

         # Is it the flow entry indicator?
         if ch == u',':
             return self.fetch_flow_entry()

         # Is it the block entry indicator?
         if ch == u'-' and self.check_block_entry():
             return self.fetch_block_entry()

         # Is it the key indicator?
         if ch == u'?' and self.check_key():
             return self.fetch_key()

         # Is it the value indicator?
         if ch == u':' and self.check_value():
             return self.fetch_value()

         # Is it an alias?
         if ch == u'*':
             return self.fetch_alias()

         # Is it an anchor?
         if ch == u'&':
             return self.fetch_anchor()

         # Is it a tag?
         if ch == u'!':
             return self.fetch_tag()

         # Is it a literal scalar?
         if ch == u'|' and not self.flow_level:
             return self.fetch_literal()

         # Is it a folded scalar?
         if ch == u'>' and not self.flow_level:
             return self.fetch_folded()

         # Is it a single quoted scalar?
         if ch == u'\'':
             return self.fetch_single()

         # Is it a double quoted scalar?
         if ch == u'\"':
             return self.fetch_double()

         # It must be a plain scalar then.
         if self.check_plain():
             return self.fetch_plain()

         # No? It's an error. Let's produce a nice error message.
         raise ScannerError("while scanning for the next token", None,
                 "found character %r that cannot start any token"
                 % ch.encode('utf-8'), self.get_mark())

     # Simple keys treatment.

     def next_possible_simple_key(self):
         # Return the number of the nearest possible simple key. Actually we
         # don't need to loop through the whole dictionary. We may replace it
         # with the following code:
         #   if not self.possible_simple_keys:
         #       return None
         #   return self.possible_simple_keys[
         #           min(self.possible_simple_keys.keys())].token_number
         min_token_number = None
         for level in self.possible_simple_keys:
             key = self.possible_simple_keys[level]
             if min_token_number is None or key.token_number < min_token_number:
                 min_token_number = key.token_number
         return min_token_number

     def stale_possible_simple_keys(self):
         # Remove entries that are no longer possible simple keys. According to
         # the YAML specification, simple keys
         # - should be limited to a single line,
         # - should be no longer than 1024 characters.
         # Disabling this procedure will allow simple keys of any length and
         # height (may cause problems if indentation is broken though).
         for level in self.possible_simple_keys.keys():
             key = self.possible_simple_keys[level]
             if key.line != self.line  \
                     or self.index-key.index > 1024:
                 if key.required:
                     raise ScannerError("while scanning a simple key", key.mark,
                             "could not find expected ':'", self.get_mark())
                 del self.possible_simple_keys[level]

     def save_possible_simple_key(self):
         # The next token may start a simple key. We check if it's possible
         # and save its position. This function is called for
         #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

         # Check if a simple key is required at the current position.
         required = not self.flow_level and self.indent == self.column

         # The next token might be a simple key. Let's save it's number and
         # position.
         if self.allow_simple_key:
             self.remove_possible_simple_key()
             token_number = self.tokens_taken+len(self.tokens)
             key = SimpleKey(token_number, required,
                     self.index, self.line, self.column, self.get_mark())
             self.possible_simple_keys[self.flow_level] = key

     def remove_possible_simple_key(self):
         # Remove the saved possible key position at the current flow level.
         if self.flow_level in self.possible_simple_keys:
             key = self.possible_simple_keys[self.flow_level]

             if key.required:
                 raise ScannerError("while scanning a simple key", key.mark,
                         "could not find expected ':'", self.get_mark())

             del self.possible_simple_keys[self.flow_level]

     # Indentation functions.

     def unwind_indent(self, column):

         ## In flow context, tokens should respect indentation.
         ## Actually the condition should be `self.indent >= column` according to
         ## the spec. But this condition will prohibit intuitively correct
         ## constructions such as
         ## key : {
         ## }
         #if self.flow_level and self.indent > column:
         #    raise ScannerError(None, None,
         #            "invalid indentation or unclosed '[' or '{'",
         #            self.get_mark())

         # In the flow context, indentation is ignored. We make the scanner less
         # restrictive then specification requires.
         if self.flow_level:
             return

         # In block context, we may need to issue the BLOCK-END tokens.
         while self.indent > column:
             mark = self.get_mark()
             self.indent = self.indents.pop()
             self.tokens.append(BlockEndToken(mark, mark))

     def add_indent(self, column):
         # Check if we need to increase indentation.
         if self.indent < column:
             self.indents.append(self.indent)
             self.indent = column
             return True
         return False

     # Fetchers.

     def fetch_stream_start(self):
         # We always add STREAM-START as the first token and STREAM-END as the
         # last token.

         # Read the token.
         mark = self.get_mark()

         # Add STREAM-START.
         self.tokens.append(StreamStartToken(mark, mark,
             encoding=self.encoding))


     def fetch_stream_end(self):

         # Set the current indentation to -1.
         self.unwind_indent(-1)

         # Reset simple keys.
         self.remove_possible_simple_key()
         self.allow_simple_key = False
         self.possible_simple_keys = {}

         # Read the token.
         mark = self.get_mark()

         # Add STREAM-END.
         self.tokens.append(StreamEndToken(mark, mark))

         # The steam is finished.
         self.done = True

     def fetch_directive(self):

         # Set the current indentation to -1.
         self.unwind_indent(-1)

         # Reset simple keys.
         self.remove_possible_simple_key()
         self.allow_simple_key = False

         # Scan and add DIRECTIVE.
         self.tokens.append(self.scan_directive())

     def fetch_document_start(self):
         self.fetch_document_indicator(DocumentStartToken)

     def fetch_document_end(self):
         self.fetch_document_indicator(DocumentEndToken)

     def fetch_document_indicator(self, TokenClass):

         # Set the current indentation to -1.
         self.unwind_indent(-1)

         # Reset simple keys. Note that there could not be a block collection
         # after '---'.
         self.remove_possible_simple_key()
         self.allow_simple_key = False

         # Add DOCUMENT-START or DOCUMENT-END.
         start_mark = self.get_mark()
         self.forward(3)
         end_mark = self.get_mark()
         self.tokens.append(TokenClass(start_mark, end_mark))

     def fetch_flow_sequence_start(self):
         self.fetch_flow_collection_start(FlowSequenceStartToken)

     def fetch_flow_mapping_start(self):
         self.fetch_flow_collection_start(FlowMappingStartToken)

     def fetch_flow_collection_start(self, TokenClass):

         # '[' and '{' may start a simple key.
         self.save_possible_simple_key()

         # Increase the flow level.
         self.flow_level += 1

         # Simple keys are allowed after '[' and '{'.
         self.allow_simple_key = True

         # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(TokenClass(start_mark, end_mark))

     def fetch_flow_sequence_end(self):
         self.fetch_flow_collection_end(FlowSequenceEndToken)

     def fetch_flow_mapping_end(self):
         self.fetch_flow_collection_end(FlowMappingEndToken)

     def fetch_flow_collection_end(self, TokenClass):

         # Reset possible simple key on the current level.
         self.remove_possible_simple_key()

         # Decrease the flow level.
         self.flow_level -= 1

         # No simple keys after ']' or '}'.
         self.allow_simple_key = False

         # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(TokenClass(start_mark, end_mark))

     def fetch_flow_entry(self):

         # Simple keys are allowed after ','.
         self.allow_simple_key = True

         # Reset possible simple key on the current level.
         self.remove_possible_simple_key()

         # Add FLOW-ENTRY.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(FlowEntryToken(start_mark, end_mark))

     def fetch_block_entry(self):

         # Block context needs additional checks.
         if not self.flow_level:

             # Are we allowed to start a new entry?
             if not self.allow_simple_key:
                 raise ScannerError(None, None,
                         "sequence entries are not allowed here",
                         self.get_mark())

             # We may need to add BLOCK-SEQUENCE-START.
             if self.add_indent(self.column):
                 mark = self.get_mark()
                 self.tokens.append(BlockSequenceStartToken(mark, mark))

         # It's an error for the block entry to occur in the flow context,
         # but we let the parser detect this.
         else:
             pass

         # Simple keys are allowed after '-'.
         self.allow_simple_key = True

         # Reset possible simple key on the current level.
         self.remove_possible_simple_key()

         # Add BLOCK-ENTRY.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(BlockEntryToken(start_mark, end_mark))

     def fetch_key(self):

         # Block context needs additional checks.
         if not self.flow_level:

             # Are we allowed to start a key (not necessary a simple)?
             if not self.allow_simple_key:
                 raise ScannerError(None, None,
                         "mapping keys are not allowed here",
                         self.get_mark())

             # We may need to add BLOCK-MAPPING-START.
             if self.add_indent(self.column):
                 mark = self.get_mark()
                 self.tokens.append(BlockMappingStartToken(mark, mark))

         # Simple keys are allowed after '?' in the block context.
         self.allow_simple_key = not self.flow_level

         # Reset possible simple key on the current level.
         self.remove_possible_simple_key()

         # Add KEY.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(KeyToken(start_mark, end_mark))

     def fetch_value(self):

         # Do we determine a simple key?
         if self.flow_level in self.possible_simple_keys:

             # Add KEY.
             key = self.possible_simple_keys[self.flow_level]
             del self.possible_simple_keys[self.flow_level]
             self.tokens.insert(key.token_number-self.tokens_taken,
                     KeyToken(key.mark, key.mark))

             # If this key starts a new block mapping, we need to add
             # BLOCK-MAPPING-START.
             if not self.flow_level:
                 if self.add_indent(key.column):
                     self.tokens.insert(key.token_number-self.tokens_taken,
                             BlockMappingStartToken(key.mark, key.mark))

             # There cannot be two simple keys one after another.
             self.allow_simple_key = False

         # It must be a part of a complex key.
         else:

             # Block context needs additional checks.
             # (Do we really need them? They will be caught by the parser
             # anyway.)
             if not self.flow_level:

                 # We are allowed to start a complex value if and only if
                 # we can start a simple key.
                 if not self.allow_simple_key:
                     raise ScannerError(None, None,
                             "mapping values are not allowed here",
                             self.get_mark())

             # If this value starts a new block mapping, we need to add
             # BLOCK-MAPPING-START.  It will be detected as an error later by
             # the parser.
             if not self.flow_level:
                 if self.add_indent(self.column):
                     mark = self.get_mark()
                     self.tokens.append(BlockMappingStartToken(mark, mark))

             # Simple keys are allowed after ':' in the block context.
             self.allow_simple_key = not self.flow_level

             # Reset possible simple key on the current level.
             self.remove_possible_simple_key()

         # Add VALUE.
         start_mark = self.get_mark()
         self.forward()
         end_mark = self.get_mark()
         self.tokens.append(ValueToken(start_mark, end_mark))

     def fetch_alias(self):

         # ALIAS could be a simple key.
         self.save_possible_simple_key()

         # No simple keys after ALIAS.
         self.allow_simple_key = False

         # Scan and add ALIAS.
         self.tokens.append(self.scan_anchor(AliasToken))

     def fetch_anchor(self):

         # ANCHOR could start a simple key.
         self.save_possible_simple_key()

         # No simple keys after ANCHOR.
         self.allow_simple_key = False

         # Scan and add ANCHOR.
         self.tokens.append(self.scan_anchor(AnchorToken))

     def fetch_tag(self):

         # TAG could start a simple key.
         self.save_possible_simple_key()

         # No simple keys after TAG.
         self.allow_simple_key = False

         # Scan and add TAG.
         self.tokens.append(self.scan_tag())

     def fetch_literal(self):
         self.fetch_block_scalar(style='|')

     def fetch_folded(self):
         self.fetch_block_scalar(style='>')

     def fetch_block_scalar(self, style):

         # A simple key may follow a block scalar.
         self.allow_simple_key = True

         # Reset possible simple key on the current level.
         self.remove_possible_simple_key()

         # Scan and add SCALAR.
         self.tokens.append(self.scan_block_scalar(style))

     def fetch_single(self):
         self.fetch_flow_scalar(style='\'')

     def fetch_double(self):
         self.fetch_flow_scalar(style='"')

     def fetch_flow_scalar(self, style):

         # A flow scalar could be a simple key.
         self.save_possible_simple_key()

         # No simple keys after flow scalars.
         self.allow_simple_key = False

         # Scan and add SCALAR.
         self.tokens.append(self.scan_flow_scalar(style))

     def fetch_plain(self):

         # A plain scalar could be a simple key.
         self.save_possible_simple_key()

         # No simple keys after plain scalars. But note that `scan_plain` will
         # change this flag if the scan is finished at the beginning of the
         # line.
         self.allow_simple_key = False

         # Scan and add SCALAR. May change `allow_simple_key`.
         self.tokens.append(self.scan_plain())

     # Checkers.

     def check_directive(self):

         # DIRECTIVE:        ^ '%' ...
         # The '%' indicator is already checked.
         if self.column == 0:
             return True

     def check_document_start(self):

         # DOCUMENT-START:   ^ '---' (' '|'\n')
         if self.column == 0:
             if self.prefix(3) == u'---'  \
                     and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                 return True

     def check_document_end(self):

         # DOCUMENT-END:     ^ '...' (' '|'\n')
         if self.column == 0:
             if self.prefix(3) == u'...'  \
                     and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                 return True

     def check_block_entry(self):

         # BLOCK-ENTRY:      '-' (' '|'\n')
         return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

     def check_key(self):

         # KEY(flow context):    '?'
         if self.flow_level:
             return True

         # KEY(block context):   '?' (' '|'\n')
         else:
             return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

     def check_value(self):

         # VALUE(flow context):  ':'
         if self.flow_level:
             return True

         # VALUE(block context): ':' (' '|'\n')
         else:
             return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

     def check_plain(self):

         # A plain scalar may start with any non-space character except:
         #   '-', '?', ':', ',', '[', ']', '{', '}',
         #   '#', '&', '*', '!', '|', '>', '\'', '\"',
         #   '%', '@', '`'.
         #
         # It may also start with
         #   '-', '?', ':'
         # if it is followed by a non-space character.
         #
         # Note that we limit the last rule to the block context (except the
         # '-' character) because we want the flow context to be space
         # independent.
         ch = self.peek()
         return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                 or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                         and (ch == u'-' or (not self.flow_level and ch in u'?:')))

     # Scanners.

     def scan_to_next_token(self):
         # We ignore spaces, line breaks and comments.
         # If we find a line break in the block context, we set the flag
         # `allow_simple_key` on.
         # The byte order mark is stripped if it's the first character in the
         # stream. We do not yet support BOM inside the stream as the
         # specification requires. Any such mark will be considered as a part
         # of the document.
         #
         # TODO: We need to make tab handling rules more sane. A good rule is
         #   Tabs cannot precede tokens
         #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
         #   KEY(block), VALUE(block), BLOCK-ENTRY
         # So the checking code is
         #   if <TAB>:
         #       self.allow_simple_keys = False
         # We also need to add the check for `allow_simple_keys == True` to
         # `unwind_indent` before issuing BLOCK-END.
         # Scanners for block, flow, and plain scalars need to be modified.

         if self.index == 0 and self.peek() == u'\uFEFF':
             self.forward()
         found = False
         while not found:
             while self.peek() == u' ':
                 self.forward()
             if self.peek() == u'#':
                 while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                     self.forward()
             if self.scan_line_break():
                 if not self.flow_level:
                     self.allow_simple_key = True
             else:
                 found = True

     def scan_directive(self):
         # See the specification for details.
         start_mark = self.get_mark()
         self.forward()
         name = self.scan_directive_name(start_mark)
         value = None
         if name == u'YAML':
             value = self.scan_yaml_directive_value(start_mark)
             end_mark = self.get_mark()
         elif name == u'TAG':
             value = self.scan_tag_directive_value(start_mark)
             end_mark = self.get_mark()
         else:
             end_mark = self.get_mark()
             while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                 self.forward()
         self.scan_directive_ignored_line(start_mark)
         return DirectiveToken(name, value, start_mark, end_mark)

     def scan_directive_name(self, start_mark):
         # See the specification for details.
         length = 0
         ch = self.peek(length)
         while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'    \
                 or ch in u'-_':
             length += 1
             ch = self.peek(length)
         if not length:
             raise ScannerError("while scanning a directive", start_mark,
                     "expected alphabetic or numeric character, but found %r"
                     % ch.encode('utf-8'), self.get_mark())
         value = self.prefix(length)
         self.forward(length)
         ch = self.peek()
         if ch not in u'\0 \r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected alphabetic or numeric character, but found %r"
                     % ch.encode('utf-8'), self.get_mark())
         return value

     def scan_yaml_directive_value(self, start_mark):
         # See the specification for details.
         while self.peek() == u' ':
             self.forward()
         major = self.scan_yaml_directive_number(start_mark)
         if self.peek() != '.':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected a digit or '.', but found %r"
                     % self.peek().encode('utf-8'),
                     self.get_mark())
         self.forward()
         minor = self.scan_yaml_directive_number(start_mark)
         if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected a digit or ' ', but found %r"
                     % self.peek().encode('utf-8'),
                     self.get_mark())
         return (major, minor)

     def scan_yaml_directive_number(self, start_mark):
         # See the specification for details.
         ch = self.peek()
         if not (u'0' <= ch <= u'9'):
             raise ScannerError("while scanning a directive", start_mark,
                     "expected a digit, but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         length = 0
         while u'0' <= self.peek(length) <= u'9':
             length += 1
         value = int(self.prefix(length))
         self.forward(length)
         return value

     def scan_tag_directive_value(self, start_mark):
         # See the specification for details.
         while self.peek() == u' ':
             self.forward()
         handle = self.scan_tag_directive_handle(start_mark)
         while self.peek() == u' ':
             self.forward()
         prefix = self.scan_tag_directive_prefix(start_mark)
         return (handle, prefix)

     def scan_tag_directive_handle(self, start_mark):
         # See the specification for details.
         value = self.scan_tag_handle('directive', start_mark)
         ch = self.peek()
         if ch != u' ':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected ' ', but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         return value

     def scan_tag_directive_prefix(self, start_mark):
         # See the specification for details.
         value = self.scan_tag_uri('directive', start_mark)
         ch = self.peek()
         if ch not in u'\0 \r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected ' ', but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         return value

     def scan_directive_ignored_line(self, start_mark):
         # See the specification for details.
         while self.peek() == u' ':
             self.forward()
         if self.peek() == u'#':
             while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                 self.forward()
         ch = self.peek()
         if ch not in u'\0\r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a directive", start_mark,
                     "expected a comment or a line break, but found %r"
                         % ch.encode('utf-8'), self.get_mark())
         self.scan_line_break()

     def scan_anchor(self, TokenClass):
         # The specification does not restrict characters for anchors and
         # aliases. This may lead to problems, for instance, the document:
         #   [ *alias, value ]
         # can be interpreted in two ways, as
         #   [ "value" ]
         # and
         #   [ *alias , "value" ]
         # Therefore we restrict aliases to numbers and ASCII letters.
         start_mark = self.get_mark()
         indicator = self.peek()
         if indicator == u'*':
             name = 'alias'
         else:
             name = 'anchor'
         self.forward()
         length = 0
         ch = self.peek(length)
         while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'    \
                 or ch in u'-_':
             length += 1
             ch = self.peek(length)
         if not length:
             raise ScannerError("while scanning an %s" % name, start_mark,
                     "expected alphabetic or numeric character, but found %r"
                     % ch.encode('utf-8'), self.get_mark())
         value = self.prefix(length)
         self.forward(length)
         ch = self.peek()
         if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
             raise ScannerError("while scanning an %s" % name, start_mark,
                     "expected alphabetic or numeric character, but found %r"
                     % ch.encode('utf-8'), self.get_mark())
         end_mark = self.get_mark()
         return TokenClass(value, start_mark, end_mark)

     def scan_tag(self):
         # See the specification for details.
         start_mark = self.get_mark()
         ch = self.peek(1)
         if ch == u'<':
             handle = None
             self.forward(2)
             suffix = self.scan_tag_uri('tag', start_mark)
             if self.peek() != u'>':
                 raise ScannerError("while parsing a tag", start_mark,
                         "expected '>', but found %r" % self.peek().encode('utf-8'),
                         self.get_mark())
             self.forward()
         elif ch in u'\0 \t\r\n\x85\u2028\u2029':
             handle = None
             suffix = u'!'
             self.forward()
         else:
             length = 1
             use_handle = False
             while ch not in u'\0 \r\n\x85\u2028\u2029':
                 if ch == u'!':
                     use_handle = True
                     break
                 length += 1
                 ch = self.peek(length)
             handle = u'!'
             if use_handle:
                 handle = self.scan_tag_handle('tag', start_mark)
             else:
                 handle = u'!'
                 self.forward()
             suffix = self.scan_tag_uri('tag', start_mark)
         ch = self.peek()
         if ch not in u'\0 \r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a tag", start_mark,
                     "expected ' ', but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         value = (handle, suffix)
         end_mark = self.get_mark()
         return TagToken(value, start_mark, end_mark)

     def scan_block_scalar(self, style):
         # See the specification for details.

         if style == '>':
             folded = True
         else:
             folded = False

         chunks = []
         start_mark = self.get_mark()

         # Scan the header.
         self.forward()
         chomping, increment = self.scan_block_scalar_indicators(start_mark)
         self.scan_block_scalar_ignored_line(start_mark)

         # Determine the indentation level and go to the first non-empty line.
         min_indent = self.indent+1
         if min_indent < 1:
             min_indent = 1
         if increment is None:
             breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
             indent = max(min_indent, max_indent)
         else:
             indent = min_indent+increment-1
             breaks, end_mark = self.scan_block_scalar_breaks(indent)
         line_break = u''

         # Scan the inner part of the block scalar.
         while self.column == indent and self.peek() != u'\0':
             chunks.extend(breaks)
             leading_non_space = self.peek() not in u' \t'
             length = 0
             while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                 length += 1
             chunks.append(self.prefix(length))
             self.forward(length)
             line_break = self.scan_line_break()
             breaks, end_mark = self.scan_block_scalar_breaks(indent)
             if self.column == indent and self.peek() != u'\0':

                 # Unfortunately, folding rules are ambiguous.
                 #
                 # This is the folding according to the specification:

                 if folded and line_break == u'\n'   \
                         and leading_non_space and self.peek() not in u' \t':
                     if not breaks:
                         chunks.append(u' ')
                 else:
                     chunks.append(line_break)

                 # This is Clark Evans's interpretation (also in the spec
                 # examples):
                 #
                 #if folded and line_break == u'\n':
                 #    if not breaks:
                 #        if self.peek() not in ' \t':
                 #            chunks.append(u' ')
                 #        else:
                 #            chunks.append(line_break)
                 #else:
                 #    chunks.append(line_break)
             else:
                 break

         # Chomp the tail.
         if chomping is not False:
             chunks.append(line_break)
         if chomping is True:
             chunks.extend(breaks)

         # We are done.
         return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                 style)

     def scan_block_scalar_indicators(self, start_mark):
         # See the specification for details.
         chomping = None
         increment = None
         ch = self.peek()
         if ch in u'+-':
             if ch == '+':
                 chomping = True
             else:
                 chomping = False
             self.forward()
             ch = self.peek()
             if ch in u'0123456789':
                 increment = int(ch)
                 if increment == 0:
                     raise ScannerError("while scanning a block scalar", start_mark,
                             "expected indentation indicator in the range 1-9, but found 0",
                             self.get_mark())
                 self.forward()
         elif ch in u'0123456789':
             increment = int(ch)
             if increment == 0:
                 raise ScannerError("while scanning a block scalar", start_mark,
                         "expected indentation indicator in the range 1-9, but found 0",
                         self.get_mark())
             self.forward()
             ch = self.peek()
             if ch in u'+-':
                 if ch == '+':
                     chomping = True
                 else:
                     chomping = False
                 self.forward()
         ch = self.peek()
         if ch not in u'\0 \r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a block scalar", start_mark,
                     "expected chomping or indentation indicators, but found %r"
                         % ch.encode('utf-8'), self.get_mark())
         return chomping, increment

     def scan_block_scalar_ignored_line(self, start_mark):
         # See the specification for details.
         while self.peek() == u' ':
             self.forward()
         if self.peek() == u'#':
             while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                 self.forward()
         ch = self.peek()
         if ch not in u'\0\r\n\x85\u2028\u2029':
             raise ScannerError("while scanning a block scalar", start_mark,
                     "expected a comment or a line break, but found %r"
                         % ch.encode('utf-8'), self.get_mark())
         self.scan_line_break()

     def scan_block_scalar_indentation(self):
         # See the specification for details.
         chunks = []
         max_indent = 0
         end_mark = self.get_mark()
         while self.peek() in u' \r\n\x85\u2028\u2029':
             if self.peek() != u' ':
                 chunks.append(self.scan_line_break())
                 end_mark = self.get_mark()
             else:
                 self.forward()
                 if self.column > max_indent:
                     max_indent = self.column
         return chunks, max_indent, end_mark

     def scan_block_scalar_breaks(self, indent):
         # See the specification for details.
         chunks = []
         end_mark = self.get_mark()
         while self.column < indent and self.peek() == u' ':
             self.forward()
         while self.peek() in u'\r\n\x85\u2028\u2029':
             chunks.append(self.scan_line_break())
             end_mark = self.get_mark()
             while self.column < indent and self.peek() == u' ':
                 self.forward()
         return chunks, end_mark

     def scan_flow_scalar(self, style):
         # See the specification for details.
         # Note that we loose indentation rules for quoted scalars. Quoted
         # scalars don't need to adhere indentation because " and ' clearly
         # mark the beginning and the end of them. Therefore we are less
         # restrictive then the specification requires. We only need to check
         # that document separators are not included in scalars.
         if style == '"':
             double = True
         else:
             double = False
         chunks = []
         start_mark = self.get_mark()
         quote = self.peek()
         self.forward()
         chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
         while self.peek() != quote:
             chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
             chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
         self.forward()
         end_mark = self.get_mark()
         return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                 style)

     ESCAPE_REPLACEMENTS = {
         u'0':   u'\0',
         u'a':   u'\x07',
         u'b':   u'\x08',
         u't':   u'\x09',
         u'\t':  u'\x09',
         u'n':   u'\x0A',
         u'v':   u'\x0B',
         u'f':   u'\x0C',
         u'r':   u'\x0D',
         u'e':   u'\x1B',
         u' ':   u'\x20',
         u'\"':  u'\"',
         u'\\':  u'\\',
         u'/':   u'/',
         u'N':   u'\x85',
         u'_':   u'\xA0',
         u'L':   u'\u2028',
         u'P':   u'\u2029',
     }

     ESCAPE_CODES = {
         u'x':   2,
         u'u':   4,
         u'U':   8,
     }

     def scan_flow_scalar_non_spaces(self, double, start_mark):
         # See the specification for details.
         chunks = []
         while True:
             length = 0
             while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                 length += 1
             if length:
                 chunks.append(self.prefix(length))
                 self.forward(length)
             ch = self.peek()
             if not double and ch == u'\'' and self.peek(1) == u'\'':
                 chunks.append(u'\'')
                 self.forward(2)
             elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                 chunks.append(ch)
                 self.forward()
             elif double and ch == u'\\':
                 self.forward()
                 ch = self.peek()
                 if ch in self.ESCAPE_REPLACEMENTS:
                     chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                     self.forward()
                 elif ch in self.ESCAPE_CODES:
                     length = self.ESCAPE_CODES[ch]
                     self.forward()
                     for k in range(length):
                         if self.peek(k) not in u'0123456789ABCDEFabcdef':
                             raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                     "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                         (length, self.peek(k).encode('utf-8')), self.get_mark())
                     code = int(self.prefix(length), 16)
                     chunks.append(unichr(code))
                     self.forward(length)
                 elif ch in u'\r\n\x85\u2028\u2029':
                     self.scan_line_break()
                     chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                 else:
                     raise ScannerError("while scanning a double-quoted scalar", start_mark,
                             "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
             else:
                 return chunks

     def scan_flow_scalar_spaces(self, double, start_mark):
         # See the specification for details.
         chunks = []
         length = 0
         while self.peek(length) in u' \t':
             length += 1
         whitespaces = self.prefix(length)
         self.forward(length)
         ch = self.peek()
         if ch == u'\0':
             raise ScannerError("while scanning a quoted scalar", start_mark,
                     "found unexpected end of stream", self.get_mark())
         elif ch in u'\r\n\x85\u2028\u2029':
             line_break = self.scan_line_break()
             breaks = self.scan_flow_scalar_breaks(double, start_mark)
             if line_break != u'\n':
                 chunks.append(line_break)
             elif not breaks:
                 chunks.append(u' ')
             chunks.extend(breaks)
         else:
             chunks.append(whitespaces)
         return chunks

     def scan_flow_scalar_breaks(self, double, start_mark):
         # See the specification for details.
         chunks = []
         while True:
             # Instead of checking indentation, we check for document
             # separators.
             prefix = self.prefix(3)
             if (prefix == u'---' or prefix == u'...')   \
                     and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                 raise ScannerError("while scanning a quoted scalar", start_mark,
                         "found unexpected document separator", self.get_mark())
             while self.peek() in u' \t':
                 self.forward()
             if self.peek() in u'\r\n\x85\u2028\u2029':
                 chunks.append(self.scan_line_break())
             else:
                 return chunks

     def scan_plain(self):
         # See the specification for details.
         # We add an additional restriction for the flow context:
         #   plain scalars in the flow context cannot contain ',' or '?'.
         # We also keep track of the `allow_simple_key` flag here.
         # Indentation rules are loosed for the flow context.
         chunks = []
         start_mark = self.get_mark()
         end_mark = start_mark
         indent = self.indent+1
         # We allow zero indentation for scalars, but then we need to check for
         # document separators at the beginning of the line.
         #if indent == 0:
         #    indent = 1
         spaces = []
         while True:
             length = 0
             if self.peek() == u'#':
                 break
             while True:
                 ch = self.peek(length)
                 if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                         or (ch == u':' and
                                 self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029'
                                       + (u',[]{}' if self.flow_level else u''))\
                         or (self.flow_level and ch in u',?[]{}'):
                     break
                 length += 1
             if length == 0:
                 break
             self.allow_simple_key = False
             chunks.extend(spaces)
             chunks.append(self.prefix(length))
             self.forward(length)
             end_mark = self.get_mark()
             spaces = self.scan_plain_spaces(indent, start_mark)
             if not spaces or self.peek() == u'#' \
                     or (not self.flow_level and self.column < indent):
                 break
         return ScalarToken(u''.join(chunks), True, start_mark, end_mark)

     def scan_plain_spaces(self, indent, start_mark):
         # See the specification for details.
         # The specification is really confusing about tabs in plain scalars.
         # We just forbid them completely. Do not use tabs in YAML!
         chunks = []
         length = 0
         while self.peek(length) in u' ':
             length += 1
         whitespaces = self.prefix(length)
         self.forward(length)
         ch = self.peek()
         if ch in u'\r\n\x85\u2028\u2029':
             line_break = self.scan_line_break()
             self.allow_simple_key = True
             prefix = self.prefix(3)
             if (prefix == u'---' or prefix == u'...')   \
                     and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                 return
             breaks = []
             while self.peek() in u' \r\n\x85\u2028\u2029':
                 if self.peek() == ' ':
                     self.forward()
                 else:
                     breaks.append(self.scan_line_break())
                     prefix = self.prefix(3)
                     if (prefix == u'---' or prefix == u'...')   \
                             and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                         return
             if line_break != u'\n':
                 chunks.append(line_break)
             elif not breaks:
                 chunks.append(u' ')
             chunks.extend(breaks)
         elif whitespaces:
             chunks.append(whitespaces)
         return chunks

     def scan_tag_handle(self, name, start_mark):
         # See the specification for details.
         # For some strange reasons, the specification does not allow '_' in
         # tag handles. I have allowed it anyway.
         ch = self.peek()
         if ch != u'!':
             raise ScannerError("while scanning a %s" % name, start_mark,
                     "expected '!', but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         length = 1
         ch = self.peek(length)
         if ch != u' ':
             while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'    \
                     or ch in u'-_':
                 length += 1
                 ch = self.peek(length)
             if ch != u'!':
                 self.forward(length)
                 raise ScannerError("while scanning a %s" % name, start_mark,
                         "expected '!', but found %r" % ch.encode('utf-8'),
                         self.get_mark())
             length += 1
         value = self.prefix(length)
         self.forward(length)
         return value

     def scan_tag_uri(self, name, start_mark):
         # See the specification for details.
         # Note: we do not check if URI is well-formed.
         chunks = []
         length = 0
         ch = self.peek(length)
         while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'    \
                 or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
             if ch == u'%':
                 chunks.append(self.prefix(length))
                 self.forward(length)
                 length = 0
                 chunks.append(self.scan_uri_escapes(name, start_mark))
             else:
                 length += 1
             ch = self.peek(length)
         if length:
             chunks.append(self.prefix(length))
             self.forward(length)
             length = 0
         if not chunks:
             raise ScannerError("while parsing a %s" % name, start_mark,
                     "expected URI, but found %r" % ch.encode('utf-8'),
                     self.get_mark())
         return u''.join(chunks)

     def scan_uri_escapes(self, name, start_mark):
         # See the specification for details.
         bytes = []
         mark = self.get_mark()
         while self.peek() == u'%':
             self.forward()
             for k in range(2):
                 if self.peek(k) not in u'0123456789ABCDEFabcdef':
                     raise ScannerError("while scanning a %s" % name, start_mark,
                             "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                 (self.peek(k).encode('utf-8')), self.get_mark())
             bytes.append(chr(int(self.prefix(2), 16)))
             self.forward(2)
         try:
             value = unicode(''.join(bytes), 'utf-8')
         except UnicodeDecodeError, exc:
             raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
         return value

     def scan_line_break(self):
         # Transforms:
         #   '\r\n'      :   '\n'
         #   '\r'        :   '\n'
         #   '\n'        :   '\n'
         #   '\x85'      :   '\n'
         #   '\u2028'    :   '\u2028'
         #   '\u2029     :   '\u2029'
         #   default     :   ''
         ch = self.peek()
         if ch in u'\r\n\x85':
             if self.prefix(2) == u'\r\n':
                 self.forward(2)
             else:
                 self.forward()
             return u'\n'
         elif ch in u'\u2028\u2029':
             self.forward()
             return ch
         return u''