123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449 |
- # mako/lexer.py
- # Copyright (C) 2006-2016 the Mako authors and contributors <see AUTHORS file>
- #
- # This module is part of Mako and is released under
- # the MIT License: http://www.opensource.org/licenses/mit-license.php
- """provides the Lexer class for parsing template strings into parse trees."""
- import re
- import codecs
- from mako import parsetree, exceptions, compat
- from mako.pygen import adjust_whitespace
# Module-wide cache of compiled patterns, keyed by ``(pattern, flags)``.
# It is filled lazily by ``Lexer.match()`` and shared by all Lexer instances.
_regexp_cache = {}


class Lexer(object):
    """Scan template source text and build a tree of ``parsetree`` nodes.

    The lexer keeps a cursor (``match_position``) into ``text`` and
    repeatedly tries a fixed sequence of ``match_*`` methods at that cursor.
    Each successful match advances the cursor and usually appends one node
    to the tree rooted at ``self.template``.

    State kept while scanning:

    * ``tag`` -- stack of currently open ``parsetree.Tag`` nodes.
    * ``control_line`` -- stack of open primary control lines.
    * ``ternary_stack`` -- parallel to ``control_line``; each entry lists the
      ternary control lines seen so far for that primary line.
    * ``lineno`` / ``matched_lineno`` / ``matched_charpos`` -- current line,
      and the line / column at which the most recent match started.
    """

    def __init__(self, text, filename=None,
                 disable_unicode=False,
                 input_encoding=None, preprocessor=None):
        """Set up scanning state for ``text``.

        :param text: template source (unicode or raw bytes).
        :param filename: stored on nodes and used in error reports.
        :param disable_unicode: when true, ``parse()`` does not decode raw
         input; rejected on Python 3.
        :param input_encoding: encoding assumed when the source has no
         magic encoding comment.
        :param preprocessor: a callable, or an iterable of callables, each
         applied to the text in ``parse()`` before lexing.
        :raises exceptions.UnsupportedError: if ``disable_unicode`` is
         requested on Python 3.
        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        self.matched_lineno = 1
        self.matched_charpos = 0
        self.lineno = 1
        self.match_position = 0
        self.tag = []
        self.control_line = []
        self.ternary_stack = []
        self.disable_unicode = disable_unicode
        self.encoding = input_encoding

        if compat.py3k and disable_unicode:
            raise exceptions.UnsupportedError(
                "Mako for Python 3 does not "
                "support disabling Unicode")

        # normalize ``preprocessor`` to an iterable of callables
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, '__iter__'):
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    @property
    def exception_kwargs(self):
        """Keyword arguments locating the most recent match, for exceptions."""
        return {'source': self.text,
                'lineno': self.matched_lineno,
                'pos': self.matched_charpos,
                'filename': self.filename}

    def match(self, regexp, flags=None):
        """Compile ``regexp`` (cached by pattern and flags) and call
        ``match_reg()`` with the compiled pattern.

        Returns whatever ``match_reg()`` returns: a match object or ``None``.
        """
        key = (regexp, flags)
        try:
            reg = _regexp_cache[key]
        except KeyError:
            if flags:
                reg = re.compile(regexp, flags)
            else:
                reg = re.compile(regexp)
            _regexp_cache[key] = reg
        return self.match_reg(reg)

    def match_reg(self, reg):
        """Match the compiled pattern ``reg`` at the current text position.

        On success the cursor, ``matched_lineno``, ``matched_charpos`` and
        ``lineno`` are updated; on failure no state changes.

        Returns the match object, or ``None``.
        """
        mp = self.match_position

        match = reg.match(self.text, mp)
        if match:
            (start, end) = match.span()
            # a zero-width match still moves the cursor one character
            # forward, so the scanning loop always makes progress
            self.match_position = end + 1 if end == start else end
            self.matched_lineno = self.lineno

            # column of the match start: distance from the closest newline
            # before ``mp`` (or from index -1 when there is none)
            cp = mp - 1
            if 0 <= cp < self.textlength:
                cp = self.text.rfind("\n", 0, cp + 1)
            self.matched_charpos = mp - cp

            self.lineno += self.text.count("\n", mp, self.match_position)
        return match

    def parse_until_text(self, watch_nesting, *text):
        """Consume Python-like source up to one of the terminator patterns.

        ``#`` comments and quoted string literals are skipped whole, so a
        terminator appearing inside them is ignored.  When ``watch_nesting``
        is true, a terminator is also ignored while any ``{}``, ``()`` or
        ``[]`` opened since the start is still unclosed.

        :param watch_nesting: whether bracket nesting suppresses terminators.
        :param text: one or more regular expression fragments, any of which
         ends the scan.
        :return: ``(consumed_text, terminator)``; the consumed text excludes
         the terminator itself.
        :raises exceptions.SyntaxException: if no terminator can be reached.
        """
        startpos = self.match_position
        text_re = r'|'.join(text)
        brace_level = 0
        paren_level = 0
        bracket_level = 0
        while True:
            # skip a Python comment up to end of line
            match = self.match(r'#.*\n')
            if match:
                continue
            # skip a complete (possibly triple-) quoted string literal
            match = self.match(r'(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1',
                               re.S)
            if match:
                continue
            match = self.match(r'(%s)' % text_re)
            if match and not (watch_nesting
                              and (brace_level > 0 or paren_level > 0
                                   or bracket_level > 0)):
                return \
                    self.text[startpos:
                              self.match_position - len(match.group(1))],\
                    match.group(1)
            elif not match:
                # plain code: run up to the next quote, comment or terminator
                match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
            if match:
                # ``match`` is either the plain-code run or a terminator
                # that was rejected because of nesting; count its brackets
                chunk = match.group(1)
                brace_level += chunk.count('{')
                brace_level -= chunk.count('}')
                paren_level += chunk.count('(')
                paren_level -= chunk.count(')')
                bracket_level += chunk.count('[')
                bracket_level -= chunk.count(']')
                continue
            raise exceptions.SyntaxException(
                "Expected: %s" %
                ','.join(text),
                **self.exception_kwargs)

    def append_node(self, nodecls, *args, **kwargs):
        """Instantiate ``nodecls`` and attach the node to the parse tree.

        ``source``, ``lineno`` and ``pos`` default to the location of the
        most recent match; ``filename`` is always set to ``self.filename``.
        The node is appended to the innermost open tag, or to the template
        root when no tag is open, and the tag / control-line stacks are
        updated according to the node's type.

        :raises exceptions.SyntaxException: for a control line that is
         neither primary, an end, nor a legal ternary of the enclosing one.
        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)
        # build a set of child nodes for the control line
        # (used for loop variable detection)
        # also build a set of child nodes on ternary control lines
        # (used for determining if a pass needs to be auto-inserted)
        if self.control_line:
            control_frame = self.control_line[-1]
            control_frame.nodes.append(node)
            if not (isinstance(node, parsetree.ControlLine) and
                    control_frame.is_ternary(node.keyword)):
                if self.ternary_stack and self.ternary_stack[-1]:
                    self.ternary_stack[-1][-1].nodes.append(node)
        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
                self.ternary_stack.pop()
            elif node.is_primary:
                self.control_line.append(node)
                self.ternary_stack.append([])
            elif self.control_line and \
                    self.control_line[-1].is_ternary(node.keyword):
                self.ternary_stack[-1].append(node)
            elif self.control_line and \
                    not self.control_line[-1].is_ternary(node.keyword):
                raise exceptions.SyntaxException(
                    "Keyword '%s' not a legal ternary for keyword '%s'" %
                    (node.keyword, self.control_line[-1].keyword),
                    **self.exception_kwargs)

    # magic encoding comment, e.g. ``# -*- coding: utf-8 -*-``
    _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')

    # a ``%`` control line or ``##`` comment line.  Group 1 is the operator,
    # group 2 the rest of the line; a backslash directly followed by a
    # newline (LF or CRLF) continues the line instead of ending it.
    _control_line_re = re.compile(
        r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\\r?\n)|[^\r\n])*)"
        r"(?:\r?\n|\Z)", re.M)

    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
        """given string/unicode or bytes/string, determine encoding
        from magic encoding comment, return body as unicode
        or raw if decode_raw=False

        The encoding is taken from a leading UTF-8 BOM, else from the magic
        comment, else ``known_encoding``, else ``'ascii'``.

        :return: ``(encoding, text)``.
        :raises exceptions.CompileException: if a UTF-8 BOM is followed by a
         magic comment naming anything other than ``utf-8``, or if decoding
         with the chosen encoding fails.
        """
        if isinstance(text, compat.text_type):
            # already unicode: only report which encoding applies
            m = self._coding_re.match(text)
            if m:
                encoding = m.group(1)
            else:
                encoding = known_encoding or 'ascii'
            return encoding, text

        if text.startswith(codecs.BOM_UTF8):
            text = text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m is not None and m.group(1) != 'utf-8':
                raise exceptions.CompileException(
                    "Found utf-8 BOM in file, with conflicting "
                    "magic encoding comment of '%s'" % m.group(1),
                    text.decode('utf-8', 'ignore'),
                    0, 0, filename)
        else:
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m:
                parsed_encoding = m.group(1)
            else:
                parsed_encoding = known_encoding or 'ascii'

        if decode_raw:
            try:
                text = text.decode(parsed_encoding)
            except UnicodeDecodeError:
                raise exceptions.CompileException(
                    "Unicode decode operation of encoding '%s' failed" %
                    parsed_encoding,
                    text.decode('utf-8', 'ignore'),
                    0, 0, filename)

        return parsed_encoding, text

    def parse(self):
        """Lex the whole template and return the root ``TemplateNode``.

        The text is decoded (unless unicode is disabled), run through the
        preprocessors, and then scanned by trying each ``match_*`` method in
        a fixed order until the end of the text is reached.

        :raises exceptions.SyntaxException: if a tag or a control line is
         still open at the end of the text.
        :raises exceptions.CompileException: if no matcher accepts the text
         at the current position.
        """
        self.encoding, self.text = self.decode_raw_stream(
            self.text,
            not self.disable_unicode,
            self.encoding,
            self.filename)

        for preproc in self.preprocessor:
            self.text = preproc(self.text)

        # match_reg() consults textlength, so it has to exist before the
        # first match attempt
        self.textlength = len(self.text)

        # push the match marker past the
        # encoding comment.
        self.match_reg(self._coding_re)

        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # location arguments are supplied like every other raise in
            # this class, so the failure surfaces as a CompileException
            raise exceptions.CompileException(
                "assertion failed", **self.exception_kwargs)

        if self.tag:
            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
                                             self.tag[-1].keyword,
                                             **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException(
                "Unterminated control keyword: '%s'" %
                self.control_line[-1].keyword,
                self.text,
                self.control_line[-1].lineno,
                self.control_line[-1].pos, self.filename)
        return self.template

    def match_tag_start(self):
        """Match an opening ``<%keyword attr="value" ...>`` tag.

        Appends a ``parsetree.Tag`` with its parsed attributes.  A
        self-closing tag (``/>``) is popped again immediately.  For
        ``<%text>`` the body up to ``</%text>`` is taken verbatim as one
        ``Text`` node and the closing tag is consumed as well.

        :return: true if a tag was matched, else ``False``.
        :raises exceptions.SyntaxException: if ``<%text>`` is never closed.
        """
        match = self.match(r'''
            \<%     # opening tag

            ([\w\.\:]+)   # keyword

            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = \
                                               #        sign, string expression

            \s*     # more whitespace

            (/)?>   # closing

            ''',
                           re.I | re.S | re.X)

        if match:
            keyword, attr, isend = match.groups()
            self.keyword = keyword
            attributes = {}
            if attr:
                for att in re.findall(
                        r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
                    key, val1, val2 = att
                    # only one of the two quote styles captured a value
                    text = val1 or val2
                    text = text.replace('\r\n', '\n')
                    attributes[key] = text
            self.append_node(parsetree.Tag, keyword, attributes)
            if isend:
                self.tag.pop()
            else:
                if keyword == 'text':
                    match = self.match(r'(.*?)(?=\</%text>)', re.S)
                    if not match:
                        raise exceptions.SyntaxException(
                            "Unclosed tag: <%%%s>" %
                            self.tag[-1].keyword,
                            **self.exception_kwargs)
                    self.append_node(parsetree.Text, match.group(1))
                    return self.match_tag_end()
            return True
        else:
            return False

    def match_tag_end(self):
        """Match a closing ``</%keyword>`` tag and pop the open tag.

        :return: ``True`` if a closing tag was matched, else ``False``.
        :raises exceptions.SyntaxException: if no tag is open, or the
         keyword differs from that of the innermost open tag.
        """
        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
        if match:
            if not self.tag:
                raise exceptions.SyntaxException(
                    "Closing tag without opening tag: </%%%s>" %
                    match.group(1),
                    **self.exception_kwargs)
            elif self.tag[-1].keyword != match.group(1):
                raise exceptions.SyntaxException(
                    "Closing tag </%%%s> does not match tag: <%%%s>" %
                    (match.group(1), self.tag[-1].keyword),
                    **self.exception_kwargs)
            self.tag.pop()
            return True
        else:
            return False

    def match_end(self):
        """Return ``True`` if the cursor is at the end of the text.

        The pattern is zero-width, so a successful match moves the cursor
        one position past the end of the text.
        """
        # ``\Z`` can only match the empty string, so a plain boolean is
        # all there is to report
        return self.match(r'\Z', re.S) is not None

    def match_text(self):
        """Match literal template text up to the next construct.

        Non-empty text becomes a ``parsetree.Text`` node.  An escaped
        newline (backslash + newline) ends the run and is discarded.

        :return: ``True`` if the pattern matched, else ``False``.
        """
        match = self.match(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
                                             # comment preceded by a
                                             # consumed newline and whitespace
                 |
                 (?=\${)      # an expression
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                              # - don't consume
                 |
                 (\\\r?\n)    # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S)

        if match:
            text = match.group(1)
            if text:
                self.append_node(parsetree.Text, text)
            return True
        else:
            return False

    def match_python_block(self):
        """Match a ``<% ... %>`` or ``<%! ... %>`` Python block.

        Appends a ``parsetree.Code`` node positioned at the opening
        delimiter; its second argument is true for the ``<%!`` form.

        :return: ``True`` if a block was matched, else ``False``.
        """
        match = self.match(r"<%(!)?")
        if match:
            line, pos = self.matched_lineno, self.matched_charpos
            text, end = self.parse_until_text(False, r'%>')
            # the trailing newline helps
            # compiler.parse() not complain about indentation
            text = adjust_whitespace(text) + "\n"
            self.append_node(
                parsetree.Code,
                text,
                match.group(1) == '!', lineno=line, pos=pos)
            return True
        else:
            return False

    def match_expression(self):
        """Match a ``${expression}`` or ``${expression | escapes}``.

        Appends a ``parsetree.Expression`` node positioned at the ``${``;
        the escapes part is stripped of surrounding whitespace and is the
        empty string when no ``|`` is present.

        :return: ``True`` if an expression was matched, else ``False``.
        """
        match = self.match(r"\${")
        if match:
            line, pos = self.matched_lineno, self.matched_charpos
            text, end = self.parse_until_text(True, r'\|', r'}')
            if end == '|':
                escapes, end = self.parse_until_text(True, r'}')
            else:
                escapes = ""
            text = text.replace('\r\n', '\n')
            self.append_node(
                parsetree.Expression,
                text, escapes.strip(),
                lineno=line, pos=pos)
            return True
        else:
            return False

    def match_control_line(self):
        """Match a ``% keyword ...`` control line or a ``##`` comment line.

        A ``%`` line appends a ``parsetree.ControlLine``; a ``##`` line
        appends a ``parsetree.Comment``.

        :return: ``True`` if a line was matched, else ``False``.
        :raises exceptions.SyntaxException: if the ``%`` line has no
         keyword, or an ``end<keyword>`` line has no matching open
         control line.
        """
        match = self.match_reg(self._control_line_re)
        if match:
            operator = match.group(1)
            text = match.group(2)
            if operator == '%':
                m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
                if not m2:
                    raise exceptions.SyntaxException(
                        "Invalid control line: '%s'" %
                        text,
                        **self.exception_kwargs)
                isend, keyword = m2.group(1, 2)
                isend = (isend is not None)

                if isend:
                    if not self.control_line:
                        raise exceptions.SyntaxException(
                            "No starting keyword '%s' for '%s'" %
                            (keyword, text),
                            **self.exception_kwargs)
                    elif self.control_line[-1].keyword != keyword:
                        raise exceptions.SyntaxException(
                            "Keyword '%s' doesn't match keyword '%s'" %
                            (text, self.control_line[-1].keyword),
                            **self.exception_kwargs)
                self.append_node(parsetree.ControlLine, keyword, isend, text)
            else:
                self.append_node(parsetree.Comment, text)
            return True
        else:
            return False

    def match_comment(self):
        """matches the multiline version of a comment"""
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if match:
            self.append_node(parsetree.Comment, match.group(1))
            return True
        else:
            return False
|