# mako/lexer.py
# Copyright (C) 2006-2016 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php

"""provides the Lexer class for parsing template strings into parse trees."""
import re
import codecs

from mako import parsetree, exceptions, compat
from mako.pygen import adjust_whitespace
  11. _regexp_cache = {}
  12. class Lexer(object):
  13. def __init__(self, text, filename=None,
  14. disable_unicode=False,
  15. input_encoding=None, preprocessor=None):
  16. self.text = text
  17. self.filename = filename
  18. self.template = parsetree.TemplateNode(self.filename)
  19. self.matched_lineno = 1
  20. self.matched_charpos = 0
  21. self.lineno = 1
  22. self.match_position = 0
  23. self.tag = []
  24. self.control_line = []
  25. self.ternary_stack = []
  26. self.disable_unicode = disable_unicode
  27. self.encoding = input_encoding
  28. if compat.py3k and disable_unicode:
  29. raise exceptions.UnsupportedError(
  30. "Mako for Python 3 does not "
  31. "support disabling Unicode")
  32. if preprocessor is None:
  33. self.preprocessor = []
  34. elif not hasattr(preprocessor, '__iter__'):
  35. self.preprocessor = [preprocessor]
  36. else:
  37. self.preprocessor = preprocessor
  38. @property
  39. def exception_kwargs(self):
  40. return {'source': self.text,
  41. 'lineno': self.matched_lineno,
  42. 'pos': self.matched_charpos,
  43. 'filename': self.filename}
  44. def match(self, regexp, flags=None):
  45. """compile the given regexp, cache the reg, and call match_reg()."""
  46. try:
  47. reg = _regexp_cache[(regexp, flags)]
  48. except KeyError:
  49. if flags:
  50. reg = re.compile(regexp, flags)
  51. else:
  52. reg = re.compile(regexp)
  53. _regexp_cache[(regexp, flags)] = reg
  54. return self.match_reg(reg)
  55. def match_reg(self, reg):
  56. """match the given regular expression object to the current text
  57. position.
  58. if a match occurs, update the current text and line position.
  59. """
  60. mp = self.match_position
  61. match = reg.match(self.text, self.match_position)
  62. if match:
  63. (start, end) = match.span()
  64. if end == start:
  65. self.match_position = end + 1
  66. else:
  67. self.match_position = end
  68. self.matched_lineno = self.lineno
  69. lines = re.findall(r"\n", self.text[mp:self.match_position])
  70. cp = mp - 1
  71. while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'):
  72. cp -= 1
  73. self.matched_charpos = mp - cp
  74. self.lineno += len(lines)
  75. # print "MATCHED:", match.group(0), "LINE START:",
  76. # self.matched_lineno, "LINE END:", self.lineno
  77. # print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \
  78. # (match and "TRUE" or "FALSE")
  79. return match
  80. def parse_until_text(self, watch_nesting, *text):
  81. startpos = self.match_position
  82. text_re = r'|'.join(text)
  83. brace_level = 0
  84. paren_level = 0
  85. bracket_level = 0
  86. while True:
  87. match = self.match(r'#.*\n')
  88. if match:
  89. continue
  90. match = self.match(r'(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1',
  91. re.S)
  92. if match:
  93. continue
  94. match = self.match(r'(%s)' % text_re)
  95. if match and not (watch_nesting
  96. and (brace_level > 0 or paren_level > 0
  97. or bracket_level > 0)):
  98. return \
  99. self.text[startpos:
  100. self.match_position - len(match.group(1))],\
  101. match.group(1)
  102. elif not match:
  103. match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
  104. if match:
  105. brace_level += match.group(1).count('{')
  106. brace_level -= match.group(1).count('}')
  107. paren_level += match.group(1).count('(')
  108. paren_level -= match.group(1).count(')')
  109. bracket_level += match.group(1).count('[')
  110. bracket_level -= match.group(1).count(']')
  111. continue
  112. raise exceptions.SyntaxException(
  113. "Expected: %s" %
  114. ','.join(text),
  115. **self.exception_kwargs)
  116. def append_node(self, nodecls, *args, **kwargs):
  117. kwargs.setdefault('source', self.text)
  118. kwargs.setdefault('lineno', self.matched_lineno)
  119. kwargs.setdefault('pos', self.matched_charpos)
  120. kwargs['filename'] = self.filename
  121. node = nodecls(*args, **kwargs)
  122. if len(self.tag):
  123. self.tag[-1].nodes.append(node)
  124. else:
  125. self.template.nodes.append(node)
  126. # build a set of child nodes for the control line
  127. # (used for loop variable detection)
  128. # also build a set of child nodes on ternary control lines
  129. # (used for determining if a pass needs to be auto-inserted
  130. if self.control_line:
  131. control_frame = self.control_line[-1]
  132. control_frame.nodes.append(node)
  133. if not (isinstance(node, parsetree.ControlLine) and
  134. control_frame.is_ternary(node.keyword)):
  135. if self.ternary_stack and self.ternary_stack[-1]:
  136. self.ternary_stack[-1][-1].nodes.append(node)
  137. if isinstance(node, parsetree.Tag):
  138. if len(self.tag):
  139. node.parent = self.tag[-1]
  140. self.tag.append(node)
  141. elif isinstance(node, parsetree.ControlLine):
  142. if node.isend:
  143. self.control_line.pop()
  144. self.ternary_stack.pop()
  145. elif node.is_primary:
  146. self.control_line.append(node)
  147. self.ternary_stack.append([])
  148. elif self.control_line and \
  149. self.control_line[-1].is_ternary(node.keyword):
  150. self.ternary_stack[-1].append(node)
  151. elif self.control_line and \
  152. not self.control_line[-1].is_ternary(node.keyword):
  153. raise exceptions.SyntaxException(
  154. "Keyword '%s' not a legal ternary for keyword '%s'" %
  155. (node.keyword, self.control_line[-1].keyword),
  156. **self.exception_kwargs)
  157. _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
  158. def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
  159. """given string/unicode or bytes/string, determine encoding
  160. from magic encoding comment, return body as unicode
  161. or raw if decode_raw=False
  162. """
  163. if isinstance(text, compat.text_type):
  164. m = self._coding_re.match(text)
  165. encoding = m and m.group(1) or known_encoding or 'ascii'
  166. return encoding, text
  167. if text.startswith(codecs.BOM_UTF8):
  168. text = text[len(codecs.BOM_UTF8):]
  169. parsed_encoding = 'utf-8'
  170. m = self._coding_re.match(text.decode('utf-8', 'ignore'))
  171. if m is not None and m.group(1) != 'utf-8':
  172. raise exceptions.CompileException(
  173. "Found utf-8 BOM in file, with conflicting "
  174. "magic encoding comment of '%s'" % m.group(1),
  175. text.decode('utf-8', 'ignore'),
  176. 0, 0, filename)
  177. else:
  178. m = self._coding_re.match(text.decode('utf-8', 'ignore'))
  179. if m:
  180. parsed_encoding = m.group(1)
  181. else:
  182. parsed_encoding = known_encoding or 'ascii'
  183. if decode_raw:
  184. try:
  185. text = text.decode(parsed_encoding)
  186. except UnicodeDecodeError:
  187. raise exceptions.CompileException(
  188. "Unicode decode operation of encoding '%s' failed" %
  189. parsed_encoding,
  190. text.decode('utf-8', 'ignore'),
  191. 0, 0, filename)
  192. return parsed_encoding, text
  193. def parse(self):
  194. self.encoding, self.text = self.decode_raw_stream(
  195. self.text,
  196. not self.disable_unicode,
  197. self.encoding,
  198. self.filename)
  199. for preproc in self.preprocessor:
  200. self.text = preproc(self.text)
  201. # push the match marker past the
  202. # encoding comment.
  203. self.match_reg(self._coding_re)
  204. self.textlength = len(self.text)
  205. while (True):
  206. if self.match_position > self.textlength:
  207. break
  208. if self.match_end():
  209. break
  210. if self.match_expression():
  211. continue
  212. if self.match_control_line():
  213. continue
  214. if self.match_comment():
  215. continue
  216. if self.match_tag_start():
  217. continue
  218. if self.match_tag_end():
  219. continue
  220. if self.match_python_block():
  221. continue
  222. if self.match_text():
  223. continue
  224. if self.match_position > self.textlength:
  225. break
  226. raise exceptions.CompileException("assertion failed")
  227. if len(self.tag):
  228. raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
  229. self.tag[-1].keyword,
  230. **self.exception_kwargs)
  231. if len(self.control_line):
  232. raise exceptions.SyntaxException(
  233. "Unterminated control keyword: '%s'" %
  234. self.control_line[-1].keyword,
  235. self.text,
  236. self.control_line[-1].lineno,
  237. self.control_line[-1].pos, self.filename)
  238. return self.template
  239. def match_tag_start(self):
  240. match = self.match(r'''
  241. \<% # opening tag
  242. ([\w\.\:]+) # keyword
  243. ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \
  244. # sign, string expression
  245. \s* # more whitespace
  246. (/)?> # closing
  247. ''',
  248. re.I | re.S | re.X)
  249. if match:
  250. keyword, attr, isend = match.groups()
  251. self.keyword = keyword
  252. attributes = {}
  253. if attr:
  254. for att in re.findall(
  255. r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
  256. key, val1, val2 = att
  257. text = val1 or val2
  258. text = text.replace('\r\n', '\n')
  259. attributes[key] = text
  260. self.append_node(parsetree.Tag, keyword, attributes)
  261. if isend:
  262. self.tag.pop()
  263. else:
  264. if keyword == 'text':
  265. match = self.match(r'(.*?)(?=\</%text>)', re.S)
  266. if not match:
  267. raise exceptions.SyntaxException(
  268. "Unclosed tag: <%%%s>" %
  269. self.tag[-1].keyword,
  270. **self.exception_kwargs)
  271. self.append_node(parsetree.Text, match.group(1))
  272. return self.match_tag_end()
  273. return True
  274. else:
  275. return False
  276. def match_tag_end(self):
  277. match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
  278. if match:
  279. if not len(self.tag):
  280. raise exceptions.SyntaxException(
  281. "Closing tag without opening tag: </%%%s>" %
  282. match.group(1),
  283. **self.exception_kwargs)
  284. elif self.tag[-1].keyword != match.group(1):
  285. raise exceptions.SyntaxException(
  286. "Closing tag </%%%s> does not match tag: <%%%s>" %
  287. (match.group(1), self.tag[-1].keyword),
  288. **self.exception_kwargs)
  289. self.tag.pop()
  290. return True
  291. else:
  292. return False
  293. def match_end(self):
  294. match = self.match(r'\Z', re.S)
  295. if match:
  296. string = match.group()
  297. if string:
  298. return string
  299. else:
  300. return True
  301. else:
  302. return False
  303. def match_text(self):
  304. match = self.match(r"""
  305. (.*?) # anything, followed by:
  306. (
  307. (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
  308. # comment preceded by a
  309. # consumed newline and whitespace
  310. |
  311. (?=\${) # an expression
  312. |
  313. (?=</?[%&]) # a substitution or block or call start or end
  314. # - don't consume
  315. |
  316. (\\\r?\n) # an escaped newline - throw away
  317. |
  318. \Z # end of string
  319. )""", re.X | re.S)
  320. if match:
  321. text = match.group(1)
  322. if text:
  323. self.append_node(parsetree.Text, text)
  324. return True
  325. else:
  326. return False
  327. def match_python_block(self):
  328. match = self.match(r"<%(!)?")
  329. if match:
  330. line, pos = self.matched_lineno, self.matched_charpos
  331. text, end = self.parse_until_text(False, r'%>')
  332. # the trailing newline helps
  333. # compiler.parse() not complain about indentation
  334. text = adjust_whitespace(text) + "\n"
  335. self.append_node(
  336. parsetree.Code,
  337. text,
  338. match.group(1) == '!', lineno=line, pos=pos)
  339. return True
  340. else:
  341. return False
  342. def match_expression(self):
  343. match = self.match(r"\${")
  344. if match:
  345. line, pos = self.matched_lineno, self.matched_charpos
  346. text, end = self.parse_until_text(True, r'\|', r'}')
  347. if end == '|':
  348. escapes, end = self.parse_until_text(True, r'}')
  349. else:
  350. escapes = ""
  351. text = text.replace('\r\n', '\n')
  352. self.append_node(
  353. parsetree.Expression,
  354. text, escapes.strip(),
  355. lineno=line, pos=pos)
  356. return True
  357. else:
  358. return False
  359. def match_control_line(self):
  360. match = self.match(
  361. r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
  362. r"(?:\r?\n|\Z)", re.M)
  363. if match:
  364. operator = match.group(1)
  365. text = match.group(2)
  366. if operator == '%':
  367. m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
  368. if not m2:
  369. raise exceptions.SyntaxException(
  370. "Invalid control line: '%s'" %
  371. text,
  372. **self.exception_kwargs)
  373. isend, keyword = m2.group(1, 2)
  374. isend = (isend is not None)
  375. if isend:
  376. if not len(self.control_line):
  377. raise exceptions.SyntaxException(
  378. "No starting keyword '%s' for '%s'" %
  379. (keyword, text),
  380. **self.exception_kwargs)
  381. elif self.control_line[-1].keyword != keyword:
  382. raise exceptions.SyntaxException(
  383. "Keyword '%s' doesn't match keyword '%s'" %
  384. (text, self.control_line[-1].keyword),
  385. **self.exception_kwargs)
  386. self.append_node(parsetree.ControlLine, keyword, isend, text)
  387. else:
  388. self.append_node(parsetree.Comment, text)
  389. return True
  390. else:
  391. return False
  392. def match_comment(self):
  393. """matches the multiline version of a comment"""
  394. match = self.match(r"<%doc>(.*?)</%doc>", re.S)
  395. if match:
  396. self.append_node(parsetree.Comment, match.group(1))
  397. return True
  398. else:
  399. return False