# vim:fileencoding=utf-8:noet from __future__ import (unicode_literals, division, absolute_import, print_function) from string import hexdigits from powerline.lint.markedjson.error import MarkedError from powerline.lint.markedjson import tokens from powerline.lib.unicode import unicode, unichr, surrogate_pair_to_character hexdigits_set = set(hexdigits) # Scanner produces tokens of the following types: # STREAM-START # STREAM-END # DOCUMENT-START # DOCUMENT-END # FLOW-SEQUENCE-START # FLOW-MAPPING-START # FLOW-SEQUENCE-END # FLOW-MAPPING-END # FLOW-ENTRY # KEY # VALUE # SCALAR(value, plain, style) # # Read comments in the Scanner code for more details. class ScannerError(MarkedError): pass class SimpleKey: # See below simple keys treatment. def __init__(self, token_number, index, line, column, mark): self.token_number = token_number self.index = index self.line = line self.column = column self.mark = mark class Scanner: def __init__(self): '''Initialize the scanner.''' # It is assumed that Scanner and Reader will have a common descendant. # Reader do the dirty work of checking for BOM and converting the # input data to Unicode. It also adds NUL to the end. # # Reader supports the following methods # self.peek(i=0) # peek the next i-th character # self.prefix(l=1) # peek the next l characters # self.forward(l=1) # read the next l characters and move the pointer. # Had we reached the end of the stream? self.done = False # The number of unclosed '{' and '['. `flow_level == 0` means block # context. self.flow_level = 0 # List of processed tokens that are not yet emitted. self.tokens = [] # Add the STREAM-START token. self.fetch_stream_start() # Number of tokens that were emitted through the `get_token` method. self.tokens_taken = 0 # Variables related to simple keys treatment. # A simple key is a key that is not denoted by the '?' indicator. # We emit the KEY token before all keys, so when we find a potential # simple key, we try to locate the corresponding ':' indicator. # Simple keys should be limited to a single line. # Can a simple key start at the current position? A simple key may # start: # - after '{', '[', ',' (in the flow context), self.allow_simple_key = False # Keep track of possible simple keys. This is a dictionary. The key # is `flow_level`; there can be no more that one possible simple key # for each level. The value is a SimpleKey record: # (token_number, index, line, column, mark) # A simple key may start with SCALAR(flow), '[', or '{' tokens. self.possible_simple_keys = {} # Public methods. def check_token(self, *choices): # Check if the next token is one of the given types. while self.need_more_tokens(): self.fetch_more_tokens() if self.tokens: if not choices: return True for choice in choices: if isinstance(self.tokens[0], choice): return True return False def peek_token(self): # Return the next token, but do not delete if from the queue. while self.need_more_tokens(): self.fetch_more_tokens() if self.tokens: return self.tokens[0] def get_token(self): # Return the next token. while self.need_more_tokens(): self.fetch_more_tokens() if self.tokens: self.tokens_taken += 1 return self.tokens.pop(0) # Private methods. def need_more_tokens(self): if self.done: return False if not self.tokens: return True # The current token may be a potential simple key, so we # need to look further. self.stale_possible_simple_keys() if self.next_possible_simple_key() == self.tokens_taken: return True def fetch_more_tokens(self): # Eat whitespaces and comments until we reach the next token. self.scan_to_next_token() # Remove obsolete possible simple keys. self.stale_possible_simple_keys() # Peek the next character. ch = self.peek() # Is it the end of stream? if ch == '\0': return self.fetch_stream_end() # Note: the order of the following checks is NOT significant. # Is it the flow sequence start indicator? if ch == '[': return self.fetch_flow_sequence_start() # Is it the flow mapping start indicator? if ch == '{': return self.fetch_flow_mapping_start() # Is it the flow sequence end indicator? if ch == ']': return self.fetch_flow_sequence_end() # Is it the flow mapping end indicator? if ch == '}': return self.fetch_flow_mapping_end() # Is it the flow entry indicator? if ch == ',': return self.fetch_flow_entry() # Is it the value indicator? if ch == ':' and self.flow_level: return self.fetch_value() # Is it a double quoted scalar? if ch == '"': return self.fetch_double() # It must be a plain scalar then. if self.check_plain(): return self.fetch_plain() # No? It’s an error. Let’s produce a nice error message. raise ScannerError( 'while scanning for the next token', None, 'found character %r that cannot start any token' % ch, self.get_mark() ) # Simple keys treatment. def next_possible_simple_key(self): # Return the number of the nearest possible simple key. Actually we # don’t need to loop through the whole dictionary. We may replace it # with the following code: # if not self.possible_simple_keys: # return None # return self.possible_simple_keys[ # min(self.possible_simple_keys.keys())].token_number min_token_number = None for level in self.possible_simple_keys: key = self.possible_simple_keys[level] if min_token_number is None or key.token_number < min_token_number: min_token_number = key.token_number return min_token_number def stale_possible_simple_keys(self): # Remove entries that are no longer possible simple keys. According to # the YAML specification, simple keys # - should be limited to a single line, # Disabling this procedure will allow simple keys of any length and # height (may cause problems if indentation is broken though). for level in list(self.possible_simple_keys): key = self.possible_simple_keys[level] if key.line != self.line: del self.possible_simple_keys[level] def save_possible_simple_key(self): # The next token may start a simple key. We check if it’s possible # and save its position. This function is called for # SCALAR(flow), '[', and '{'. # The next token might be a simple key. Let’s save it’s number and # position. if self.allow_simple_key: self.remove_possible_simple_key() token_number = self.tokens_taken + len(self.tokens) key = SimpleKey(token_number, self.index, self.line, self.column, self.get_mark()) self.possible_simple_keys[self.flow_level] = key def remove_possible_simple_key(self): # Remove the saved possible key position at the current flow level. if self.flow_level in self.possible_simple_keys: del self.possible_simple_keys[self.flow_level] # Fetchers. def fetch_stream_start(self): # We always add STREAM-START as the first token and STREAM-END as the # last token. # Read the token. mark = self.get_mark() # Add STREAM-START. self.tokens.append(tokens.StreamStartToken(mark, mark, encoding=self.encoding)) def fetch_stream_end(self): # Reset simple keys. self.remove_possible_simple_key() self.allow_simple_key = False self.possible_simple_keys = {} # Read the token. mark = self.get_mark() # Add STREAM-END. self.tokens.append(tokens.StreamEndToken(mark, mark)) # The steam is finished. self.done = True def fetch_flow_sequence_start(self): self.fetch_flow_collection_start(tokens.FlowSequenceStartToken) def fetch_flow_mapping_start(self): self.fetch_flow_collection_start(tokens.FlowMappingStartToken) def fetch_flow_collection_start(self, TokenClass): # '[' and '{' may start a simple key. self.save_possible_simple_key() # Increase the flow level. self.flow_level += 1 # Simple keys are allowed after '[' and '{'. self.allow_simple_key = True # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. start_mark = self.get_mark() self.forward() end_mark = self.get_mark() self.tokens.append(TokenClass(start_mark, end_mark)) def fetch_flow_sequence_end(self): self.fetch_flow_collection_end(tokens.FlowSequenceEndToken) def fetch_flow_mapping_end(self): self.fetch_flow_collection_end(tokens.FlowMappingEndToken) def fetch_flow_collection_end(self, TokenClass): # Reset possible simple key on the current level. self.remove_possible_simple_key() # Decrease the flow level. self.flow_level -= 1 # No simple keys after ']' or '}'. self.allow_simple_key = False # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. start_mark = self.get_mark() self.forward() end_mark = self.get_mark() self.tokens.append(TokenClass(start_mark, end_mark)) def fetch_value(self): # Do we determine a simple key? if self.flow_level in self.possible_simple_keys: # Add KEY. key = self.possible_simple_keys[self.flow_level] del self.possible_simple_keys[self.flow_level] self.tokens.insert(key.token_number - self.tokens_taken, tokens.KeyToken(key.mark, key.mark)) # There cannot be two simple keys one after another. self.allow_simple_key = False # Add VALUE. start_mark = self.get_mark() self.forward() end_mark = self.get_mark() self.tokens.append(tokens.ValueToken(start_mark, end_mark)) def fetch_flow_entry(self): # Simple keys are allowed after ','. self.allow_simple_key = True # Reset possible simple key on the current level. self.remove_possible_simple_key() # Add FLOW-ENTRY. start_mark = self.get_mark() self.forward() end_mark = self.get_mark() self.tokens.append(tokens.FlowEntryToken(start_mark, end_mark)) def fetch_double(self): # A flow scalar could be a simple key. self.save_possible_simple_key() # No simple keys after flow scalars. self.allow_simple_key = False # Scan and add SCALAR. self.tokens.append(self.scan_flow_scalar()) def fetch_plain(self): self.save_possible_simple_key() # No simple keys after plain scalars. self.allow_simple_key = False # Scan and add SCALAR. May change `allow_simple_key`. self.tokens.append(self.scan_plain()) # Checkers. def check_plain(self): return self.peek() in '0123456789-ntf' # Scanners. def scan_to_next_token(self): while self.peek() in ' \t\n': self.forward() def scan_flow_scalar(self): # See the specification for details. # Note that we loose indentation rules for quoted scalars. Quoted # scalars don’t need to adhere indentation because " and ' clearly # mark the beginning and the end of them. Therefore we are less # restrictive then the specification requires. We only need to check # that document separators are not included in scalars. chunks = [] start_mark = self.get_mark() quote = self.peek() self.forward() chunks.extend(self.scan_flow_scalar_non_spaces(start_mark)) while self.peek() != quote: chunks.extend(self.scan_flow_scalar_spaces(start_mark)) chunks.extend(self.scan_flow_scalar_non_spaces(start_mark)) self.forward() end_mark = self.get_mark() return tokens.ScalarToken(unicode().join(chunks), False, start_mark, end_mark, '"') ESCAPE_REPLACEMENTS = { 'b': '\x08', 't': '\x09', 'n': '\x0A', 'f': '\x0C', 'r': '\x0D', '"': '\"', '\\': '\\', } ESCAPE_CODES = { 'u': 4, } def scan_flow_scalar_non_spaces(self, start_mark): # See the specification for details. chunks = [] while True: length = 0 while self.peek(length) not in '\"\\\0 \t\n': length += 1 if length: chunks.append(self.prefix(length)) self.forward(length) ch = self.peek() if ch == '\\': self.forward() ch = self.peek() if ch in self.ESCAPE_REPLACEMENTS: chunks.append(self.ESCAPE_REPLACEMENTS[ch]) self.forward() elif ch in self.ESCAPE_CODES: length = self.ESCAPE_CODES[ch] self.forward() for k in range(length): if self.peek(k) not in hexdigits: raise ScannerError( 'while scanning a double-quoted scalar', start_mark, 'expected escape sequence of %d hexdecimal numbers, but found %r' % ( length, self.peek(k)), self.get_mark() ) code = int(self.prefix(length), 16) self.forward(length) if 0xD800 <= code <= 0xDC00: # Start of the surrogate pair next_char = self.prefix(6) if ( next_char[0] != '\\' or next_char[1] != 'u' or not (set(next_char[2:]) < hexdigits_set) or not (0xDC00 <= int(next_char[2:], 16) <= 0xDFFF) ): raise ScannerError( 'while scanning a double-quoted scalar', start_mark, 'expected escape sequence with the next character in surrogate pair, but found %r' % ( next_char ), self.get_mark() ) code = surrogate_pair_to_character(code, int(next_char[2:], 16)) self.forward(6) chunks.append(unichr(code)) else: raise ScannerError( 'while scanning a double-quoted scalar', start_mark, ('found unknown escape character %r' % ch), self.get_mark() ) else: return chunks def scan_flow_scalar_spaces(self, start_mark): # See the specification for details. chunks = [] length = 0 while self.peek(length) in ' \t': length += 1 whitespaces = self.prefix(length) self.forward(length) ch = self.peek() if ch == '\0': raise ScannerError( 'while scanning a quoted scalar', start_mark, 'found unexpected end of stream', self.get_mark() ) elif ch == '\n': raise ScannerError( 'while scanning a quoted scalar', start_mark, 'found unexpected line end', self.get_mark() ) else: chunks.append(whitespaces) return chunks def scan_plain(self): chunks = [] start_mark = self.get_mark() spaces = [] while True: length = 0 while True: if self.peek(length) not in 'eE.0123456789nul-tr+fas': break length += 1 if length == 0: break self.allow_simple_key = False chunks.extend(spaces) chunks.append(self.prefix(length)) self.forward(length) end_mark = self.get_mark() return tokens.ScalarToken(''.join(chunks), True, start_mark, end_mark)