sre_parse.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. import sys
  13. from sre_constants import *
  14. SPECIAL_CHARS = ".\\[{()*+?^$|"
  15. REPEAT_CHARS = "*+?{"
  16. DIGITS = set("0123456789")
  17. OCTDIGITS = set("01234567")
  18. HEXDIGITS = set("0123456789abcdefABCDEF")
  19. WHITESPACE = set(" \t\n\r\v\f")
  20. ESCAPES = {
  21. r"\a": (LITERAL, ord("\a")),
  22. r"\b": (LITERAL, ord("\b")),
  23. r"\f": (LITERAL, ord("\f")),
  24. r"\n": (LITERAL, ord("\n")),
  25. r"\r": (LITERAL, ord("\r")),
  26. r"\t": (LITERAL, ord("\t")),
  27. r"\v": (LITERAL, ord("\v")),
  28. r"\\": (LITERAL, ord("\\"))
  29. }
  30. CATEGORIES = {
  31. r"\A": (AT, AT_BEGINNING_STRING), # start of string
  32. r"\b": (AT, AT_BOUNDARY),
  33. r"\B": (AT, AT_NON_BOUNDARY),
  34. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
  35. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
  36. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
  37. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
  38. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
  39. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
  40. r"\Z": (AT, AT_END_STRING), # end of string
  41. }
  42. FLAGS = {
  43. # standard flags
  44. "i": SRE_FLAG_IGNORECASE,
  45. "L": SRE_FLAG_LOCALE,
  46. "m": SRE_FLAG_MULTILINE,
  47. "s": SRE_FLAG_DOTALL,
  48. "x": SRE_FLAG_VERBOSE,
  49. # extensions
  50. "t": SRE_FLAG_TEMPLATE,
  51. "u": SRE_FLAG_UNICODE,
  52. }
  53. class Pattern:
  54. # master pattern object. keeps track of global attributes
  55. def __init__(self):
  56. self.flags = 0
  57. self.open = []
  58. self.groups = 1
  59. self.groupdict = {}
  60. self.lookbehind = 0
  61. def opengroup(self, name=None):
  62. gid = self.groups
  63. self.groups = gid + 1
  64. if name is not None:
  65. ogid = self.groupdict.get(name, None)
  66. if ogid is not None:
  67. raise error, ("redefinition of group name %s as group %d; "
  68. "was group %d" % (repr(name), gid, ogid))
  69. self.groupdict[name] = gid
  70. self.open.append(gid)
  71. return gid
  72. def closegroup(self, gid):
  73. self.open.remove(gid)
  74. def checkgroup(self, gid):
  75. return gid < self.groups and gid not in self.open
  76. class SubPattern:
  77. # a subpattern, in intermediate form
  78. def __init__(self, pattern, data=None):
  79. self.pattern = pattern
  80. if data is None:
  81. data = []
  82. self.data = data
  83. self.width = None
  84. def dump(self, level=0):
  85. seqtypes = (tuple, list)
  86. for op, av in self.data:
  87. print level*" " + op,
  88. if op == IN:
  89. # member sublanguage
  90. print
  91. for op, a in av:
  92. print (level+1)*" " + op, a
  93. elif op == BRANCH:
  94. print
  95. for i, a in enumerate(av[1]):
  96. if i:
  97. print level*" " + "or"
  98. a.dump(level+1)
  99. elif op == GROUPREF_EXISTS:
  100. condgroup, item_yes, item_no = av
  101. print condgroup
  102. item_yes.dump(level+1)
  103. if item_no:
  104. print level*" " + "else"
  105. item_no.dump(level+1)
  106. elif isinstance(av, seqtypes):
  107. nl = 0
  108. for a in av:
  109. if isinstance(a, SubPattern):
  110. if not nl:
  111. print
  112. a.dump(level+1)
  113. nl = 1
  114. else:
  115. print a,
  116. nl = 0
  117. if not nl:
  118. print
  119. else:
  120. print av
  121. def __repr__(self):
  122. return repr(self.data)
  123. def __len__(self):
  124. return len(self.data)
  125. def __delitem__(self, index):
  126. del self.data[index]
  127. def __getitem__(self, index):
  128. if isinstance(index, slice):
  129. return SubPattern(self.pattern, self.data[index])
  130. return self.data[index]
  131. def __setitem__(self, index, code):
  132. self.data[index] = code
  133. def insert(self, index, code):
  134. self.data.insert(index, code)
  135. def append(self, code):
  136. self.data.append(code)
  137. def getwidth(self):
  138. # determine the width (min, max) for this subpattern
  139. if self.width:
  140. return self.width
  141. lo = hi = 0
  142. UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
  143. REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
  144. for op, av in self.data:
  145. if op is BRANCH:
  146. i = MAXREPEAT - 1
  147. j = 0
  148. for av in av[1]:
  149. l, h = av.getwidth()
  150. i = min(i, l)
  151. j = max(j, h)
  152. lo = lo + i
  153. hi = hi + j
  154. elif op is CALL:
  155. i, j = av.getwidth()
  156. lo = lo + i
  157. hi = hi + j
  158. elif op is SUBPATTERN:
  159. i, j = av[1].getwidth()
  160. lo = lo + i
  161. hi = hi + j
  162. elif op in REPEATCODES:
  163. i, j = av[2].getwidth()
  164. lo = lo + i * av[0]
  165. hi = hi + j * av[1]
  166. elif op in UNITCODES:
  167. lo = lo + 1
  168. hi = hi + 1
  169. elif op == SUCCESS:
  170. break
  171. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  172. return self.width
  173. class Tokenizer:
  174. def __init__(self, string):
  175. self.string = string
  176. self.index = 0
  177. self.__next()
  178. def __next(self):
  179. if self.index >= len(self.string):
  180. self.next = None
  181. return
  182. char = self.string[self.index]
  183. if char[0] == "\\":
  184. try:
  185. c = self.string[self.index + 1]
  186. except IndexError:
  187. raise error, "bogus escape (end of line)"
  188. char = char + c
  189. self.index = self.index + len(char)
  190. self.next = char
  191. def match(self, char, skip=1):
  192. if char == self.next:
  193. if skip:
  194. self.__next()
  195. return 1
  196. return 0
  197. def get(self):
  198. this = self.next
  199. self.__next()
  200. return this
  201. def tell(self):
  202. return self.index, self.next
  203. def seek(self, index):
  204. self.index, self.next = index
  205. def isident(char):
  206. return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
  207. def isdigit(char):
  208. return "0" <= char <= "9"
  209. def isname(name):
  210. # check that group name is a valid string
  211. if not isident(name[0]):
  212. return False
  213. for char in name[1:]:
  214. if not isident(char) and not isdigit(char):
  215. return False
  216. return True
  217. def _class_escape(source, escape):
  218. # handle escape code inside character class
  219. code = ESCAPES.get(escape)
  220. if code:
  221. return code
  222. code = CATEGORIES.get(escape)
  223. if code and code[0] == IN:
  224. return code
  225. try:
  226. c = escape[1:2]
  227. if c == "x":
  228. # hexadecimal escape (exactly two digits)
  229. while source.next in HEXDIGITS and len(escape) < 4:
  230. escape = escape + source.get()
  231. escape = escape[2:]
  232. if len(escape) != 2:
  233. raise error, "bogus escape: %s" % repr("\\" + escape)
  234. return LITERAL, int(escape, 16) & 0xff
  235. elif c in OCTDIGITS:
  236. # octal escape (up to three digits)
  237. while source.next in OCTDIGITS and len(escape) < 4:
  238. escape = escape + source.get()
  239. escape = escape[1:]
  240. return LITERAL, int(escape, 8) & 0xff
  241. elif c in DIGITS:
  242. raise error, "bogus escape: %s" % repr(escape)
  243. if len(escape) == 2:
  244. return LITERAL, ord(escape[1])
  245. except ValueError:
  246. pass
  247. raise error, "bogus escape: %s" % repr(escape)
  248. def _escape(source, escape, state):
  249. # handle escape code in expression
  250. code = CATEGORIES.get(escape)
  251. if code:
  252. return code
  253. code = ESCAPES.get(escape)
  254. if code:
  255. return code
  256. try:
  257. c = escape[1:2]
  258. if c == "x":
  259. # hexadecimal escape
  260. while source.next in HEXDIGITS and len(escape) < 4:
  261. escape = escape + source.get()
  262. if len(escape) != 4:
  263. raise ValueError
  264. return LITERAL, int(escape[2:], 16) & 0xff
  265. elif c == "0":
  266. # octal escape
  267. while source.next in OCTDIGITS and len(escape) < 4:
  268. escape = escape + source.get()
  269. return LITERAL, int(escape[1:], 8) & 0xff
  270. elif c in DIGITS:
  271. # octal escape *or* decimal group reference (sigh)
  272. if source.next in DIGITS:
  273. escape = escape + source.get()
  274. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  275. source.next in OCTDIGITS):
  276. # got three octal digits; this is an octal escape
  277. escape = escape + source.get()
  278. return LITERAL, int(escape[1:], 8) & 0xff
  279. # not an octal escape, so this is a group reference
  280. group = int(escape[1:])
  281. if group < state.groups:
  282. if not state.checkgroup(group):
  283. raise error, "cannot refer to open group"
  284. if state.lookbehind:
  285. import warnings
  286. warnings.warn('group references in lookbehind '
  287. 'assertions are not supported',
  288. RuntimeWarning)
  289. return GROUPREF, group
  290. raise ValueError
  291. if len(escape) == 2:
  292. return LITERAL, ord(escape[1])
  293. except ValueError:
  294. pass
  295. raise error, "bogus escape: %s" % repr(escape)
  296. def _parse_sub(source, state, nested=1):
  297. # parse an alternation: a|b|c
  298. items = []
  299. itemsappend = items.append
  300. sourcematch = source.match
  301. while 1:
  302. itemsappend(_parse(source, state))
  303. if sourcematch("|"):
  304. continue
  305. if not nested:
  306. break
  307. if not source.next or sourcematch(")", 0):
  308. break
  309. else:
  310. raise error, "pattern not properly closed"
  311. if len(items) == 1:
  312. return items[0]
  313. subpattern = SubPattern(state)
  314. subpatternappend = subpattern.append
  315. # check if all items share a common prefix
  316. while 1:
  317. prefix = None
  318. for item in items:
  319. if not item:
  320. break
  321. if prefix is None:
  322. prefix = item[0]
  323. elif item[0] != prefix:
  324. break
  325. else:
  326. # all subitems start with a common "prefix".
  327. # move it out of the branch
  328. for item in items:
  329. del item[0]
  330. subpatternappend(prefix)
  331. continue # check next one
  332. break
  333. # check if the branch can be replaced by a character set
  334. for item in items:
  335. if len(item) != 1 or item[0][0] != LITERAL:
  336. break
  337. else:
  338. # we can store this as a character set instead of a
  339. # branch (the compiler may optimize this even more)
  340. set = []
  341. setappend = set.append
  342. for item in items:
  343. setappend(item[0])
  344. subpatternappend((IN, set))
  345. return subpattern
  346. subpattern.append((BRANCH, (None, items)))
  347. return subpattern
  348. def _parse_sub_cond(source, state, condgroup):
  349. item_yes = _parse(source, state)
  350. if source.match("|"):
  351. item_no = _parse(source, state)
  352. if source.match("|"):
  353. raise error, "conditional backref with more than two branches"
  354. else:
  355. item_no = None
  356. if source.next and not source.match(")", 0):
  357. raise error, "pattern not properly closed"
  358. subpattern = SubPattern(state)
  359. subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  360. return subpattern
  361. _PATTERNENDERS = set("|)")
  362. _ASSERTCHARS = set("=!<")
  363. _LOOKBEHINDASSERTCHARS = set("=!")
  364. _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
  365. def _parse(source, state):
  366. # parse a simple pattern
  367. subpattern = SubPattern(state)
  368. # precompute constants into local variables
  369. subpatternappend = subpattern.append
  370. sourceget = source.get
  371. sourcematch = source.match
  372. _len = len
  373. PATTERNENDERS = _PATTERNENDERS
  374. ASSERTCHARS = _ASSERTCHARS
  375. LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
  376. REPEATCODES = _REPEATCODES
  377. while 1:
  378. if source.next in PATTERNENDERS:
  379. break # end of subpattern
  380. this = sourceget()
  381. if this is None:
  382. break # end of pattern
  383. if state.flags & SRE_FLAG_VERBOSE:
  384. # skip whitespace and comments
  385. if this in WHITESPACE:
  386. continue
  387. if this == "#":
  388. while 1:
  389. this = sourceget()
  390. if this in (None, "\n"):
  391. break
  392. continue
  393. if this and this[0] not in SPECIAL_CHARS:
  394. subpatternappend((LITERAL, ord(this)))
  395. elif this == "[":
  396. # character set
  397. set = []
  398. setappend = set.append
  399. ## if sourcematch(":"):
  400. ## pass # handle character classes
  401. if sourcematch("^"):
  402. setappend((NEGATE, None))
  403. # check remaining characters
  404. start = set[:]
  405. while 1:
  406. this = sourceget()
  407. if this == "]" and set != start:
  408. break
  409. elif this and this[0] == "\\":
  410. code1 = _class_escape(source, this)
  411. elif this:
  412. code1 = LITERAL, ord(this)
  413. else:
  414. raise error, "unexpected end of regular expression"
  415. if sourcematch("-"):
  416. # potential range
  417. this = sourceget()
  418. if this == "]":
  419. if code1[0] is IN:
  420. code1 = code1[1][0]
  421. setappend(code1)
  422. setappend((LITERAL, ord("-")))
  423. break
  424. elif this:
  425. if this[0] == "\\":
  426. code2 = _class_escape(source, this)
  427. else:
  428. code2 = LITERAL, ord(this)
  429. if code1[0] != LITERAL or code2[0] != LITERAL:
  430. raise error, "bad character range"
  431. lo = code1[1]
  432. hi = code2[1]
  433. if hi < lo:
  434. raise error, "bad character range"
  435. setappend((RANGE, (lo, hi)))
  436. else:
  437. raise error, "unexpected end of regular expression"
  438. else:
  439. if code1[0] is IN:
  440. code1 = code1[1][0]
  441. setappend(code1)
  442. # XXX: <fl> should move set optimization to compiler!
  443. if _len(set)==1 and set[0][0] is LITERAL:
  444. subpatternappend(set[0]) # optimization
  445. elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
  446. subpatternappend((NOT_LITERAL, set[1][1])) # optimization
  447. else:
  448. # XXX: <fl> should add charmap optimization here
  449. subpatternappend((IN, set))
  450. elif this and this[0] in REPEAT_CHARS:
  451. # repeat previous item
  452. if this == "?":
  453. min, max = 0, 1
  454. elif this == "*":
  455. min, max = 0, MAXREPEAT
  456. elif this == "+":
  457. min, max = 1, MAXREPEAT
  458. elif this == "{":
  459. if source.next == "}":
  460. subpatternappend((LITERAL, ord(this)))
  461. continue
  462. here = source.tell()
  463. min, max = 0, MAXREPEAT
  464. lo = hi = ""
  465. while source.next in DIGITS:
  466. lo = lo + source.get()
  467. if sourcematch(","):
  468. while source.next in DIGITS:
  469. hi = hi + sourceget()
  470. else:
  471. hi = lo
  472. if not sourcematch("}"):
  473. subpatternappend((LITERAL, ord(this)))
  474. source.seek(here)
  475. continue
  476. if lo:
  477. min = int(lo)
  478. if min >= MAXREPEAT:
  479. raise OverflowError("the repetition number is too large")
  480. if hi:
  481. max = int(hi)
  482. if max >= MAXREPEAT:
  483. raise OverflowError("the repetition number is too large")
  484. if max < min:
  485. raise error("bad repeat interval")
  486. else:
  487. raise error, "not supported"
  488. # figure out which item to repeat
  489. if subpattern:
  490. item = subpattern[-1:]
  491. else:
  492. item = None
  493. if not item or (_len(item) == 1 and item[0][0] == AT):
  494. raise error, "nothing to repeat"
  495. if item[0][0] in REPEATCODES:
  496. raise error, "multiple repeat"
  497. if sourcematch("?"):
  498. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  499. else:
  500. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  501. elif this == ".":
  502. subpatternappend((ANY, None))
  503. elif this == "(":
  504. group = 1
  505. name = None
  506. condgroup = None
  507. if sourcematch("?"):
  508. group = 0
  509. # options
  510. if sourcematch("P"):
  511. # python extensions
  512. if sourcematch("<"):
  513. # named group: skip forward to end of name
  514. name = ""
  515. while 1:
  516. char = sourceget()
  517. if char is None:
  518. raise error, "unterminated name"
  519. if char == ">":
  520. break
  521. name = name + char
  522. group = 1
  523. if not name:
  524. raise error("missing group name")
  525. if not isname(name):
  526. raise error("bad character in group name %r" %
  527. name)
  528. elif sourcematch("="):
  529. # named backreference
  530. name = ""
  531. while 1:
  532. char = sourceget()
  533. if char is None:
  534. raise error, "unterminated name"
  535. if char == ")":
  536. break
  537. name = name + char
  538. if not name:
  539. raise error("missing group name")
  540. if not isname(name):
  541. raise error("bad character in backref group name "
  542. "%r" % name)
  543. gid = state.groupdict.get(name)
  544. if gid is None:
  545. msg = "unknown group name: {0!r}".format(name)
  546. raise error(msg)
  547. if state.lookbehind:
  548. import warnings
  549. warnings.warn('group references in lookbehind '
  550. 'assertions are not supported',
  551. RuntimeWarning)
  552. subpatternappend((GROUPREF, gid))
  553. continue
  554. else:
  555. char = sourceget()
  556. if char is None:
  557. raise error, "unexpected end of pattern"
  558. raise error, "unknown specifier: ?P%s" % char
  559. elif sourcematch(":"):
  560. # non-capturing group
  561. group = 2
  562. elif sourcematch("#"):
  563. # comment
  564. while 1:
  565. if source.next is None or source.next == ")":
  566. break
  567. sourceget()
  568. if not sourcematch(")"):
  569. raise error, "unbalanced parenthesis"
  570. continue
  571. elif source.next in ASSERTCHARS:
  572. # lookahead assertions
  573. char = sourceget()
  574. dir = 1
  575. if char == "<":
  576. if source.next not in LOOKBEHINDASSERTCHARS:
  577. raise error, "syntax error"
  578. dir = -1 # lookbehind
  579. char = sourceget()
  580. state.lookbehind += 1
  581. p = _parse_sub(source, state)
  582. if dir < 0:
  583. state.lookbehind -= 1
  584. if not sourcematch(")"):
  585. raise error, "unbalanced parenthesis"
  586. if char == "=":
  587. subpatternappend((ASSERT, (dir, p)))
  588. else:
  589. subpatternappend((ASSERT_NOT, (dir, p)))
  590. continue
  591. elif sourcematch("("):
  592. # conditional backreference group
  593. condname = ""
  594. while 1:
  595. char = sourceget()
  596. if char is None:
  597. raise error, "unterminated name"
  598. if char == ")":
  599. break
  600. condname = condname + char
  601. group = 2
  602. if not condname:
  603. raise error("missing group name")
  604. if isname(condname):
  605. condgroup = state.groupdict.get(condname)
  606. if condgroup is None:
  607. msg = "unknown group name: {0!r}".format(condname)
  608. raise error(msg)
  609. else:
  610. try:
  611. condgroup = int(condname)
  612. except ValueError:
  613. raise error, "bad character in group name"
  614. if state.lookbehind:
  615. import warnings
  616. warnings.warn('group references in lookbehind '
  617. 'assertions are not supported',
  618. RuntimeWarning)
  619. else:
  620. # flags
  621. if not source.next in FLAGS:
  622. raise error, "unexpected end of pattern"
  623. while source.next in FLAGS:
  624. state.flags = state.flags | FLAGS[sourceget()]
  625. if group:
  626. # parse group contents
  627. if group == 2:
  628. # anonymous group
  629. group = None
  630. else:
  631. group = state.opengroup(name)
  632. if condgroup:
  633. p = _parse_sub_cond(source, state, condgroup)
  634. else:
  635. p = _parse_sub(source, state)
  636. if not sourcematch(")"):
  637. raise error, "unbalanced parenthesis"
  638. if group is not None:
  639. state.closegroup(group)
  640. subpatternappend((SUBPATTERN, (group, p)))
  641. else:
  642. while 1:
  643. char = sourceget()
  644. if char is None:
  645. raise error, "unexpected end of pattern"
  646. if char == ")":
  647. break
  648. raise error, "unknown extension"
  649. elif this == "^":
  650. subpatternappend((AT, AT_BEGINNING))
  651. elif this == "$":
  652. subpattern.append((AT, AT_END))
  653. elif this and this[0] == "\\":
  654. code = _escape(source, this, state)
  655. subpatternappend(code)
  656. else:
  657. raise error, "parser error"
  658. return subpattern
  659. def parse(str, flags=0, pattern=None):
  660. # parse 're' pattern into list of (opcode, argument) tuples
  661. source = Tokenizer(str)
  662. if pattern is None:
  663. pattern = Pattern()
  664. pattern.flags = flags
  665. pattern.str = str
  666. p = _parse_sub(source, pattern, 0)
  667. tail = source.get()
  668. if tail == ")":
  669. raise error, "unbalanced parenthesis"
  670. elif tail:
  671. raise error, "bogus characters at end of regular expression"
  672. if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
  673. # the VERBOSE flag was switched on inside the pattern. to be
  674. # on the safe side, we'll parse the whole thing again...
  675. return parse(str, p.pattern.flags)
  676. if flags & SRE_FLAG_DEBUG:
  677. p.dump()
  678. return p
  679. def parse_template(source, pattern):
  680. # parse 're' replacement string into list of literals and
  681. # group references
  682. s = Tokenizer(source)
  683. sget = s.get
  684. p = []
  685. a = p.append
  686. def literal(literal, p=p, pappend=a):
  687. if p and p[-1][0] is LITERAL:
  688. p[-1] = LITERAL, p[-1][1] + literal
  689. else:
  690. pappend((LITERAL, literal))
  691. sep = source[:0]
  692. if type(sep) is type(""):
  693. makechar = chr
  694. else:
  695. makechar = unichr
  696. while 1:
  697. this = sget()
  698. if this is None:
  699. break # end of replacement string
  700. if this and this[0] == "\\":
  701. # group
  702. c = this[1:2]
  703. if c == "g":
  704. name = ""
  705. if s.match("<"):
  706. while 1:
  707. char = sget()
  708. if char is None:
  709. raise error, "unterminated group name"
  710. if char == ">":
  711. break
  712. name = name + char
  713. if not name:
  714. raise error, "missing group name"
  715. try:
  716. index = int(name)
  717. if index < 0:
  718. raise error, "negative group number"
  719. except ValueError:
  720. if not isname(name):
  721. raise error, "bad character in group name"
  722. try:
  723. index = pattern.groupindex[name]
  724. except KeyError:
  725. msg = "unknown group name: {0!r}".format(name)
  726. raise IndexError(msg)
  727. a((MARK, index))
  728. elif c == "0":
  729. if s.next in OCTDIGITS:
  730. this = this + sget()
  731. if s.next in OCTDIGITS:
  732. this = this + sget()
  733. literal(makechar(int(this[1:], 8) & 0xff))
  734. elif c in DIGITS:
  735. isoctal = False
  736. if s.next in DIGITS:
  737. this = this + sget()
  738. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  739. s.next in OCTDIGITS):
  740. this = this + sget()
  741. isoctal = True
  742. literal(makechar(int(this[1:], 8) & 0xff))
  743. if not isoctal:
  744. a((MARK, int(this[1:])))
  745. else:
  746. try:
  747. this = makechar(ESCAPES[this][1])
  748. except KeyError:
  749. pass
  750. literal(this)
  751. else:
  752. literal(this)
  753. # convert template to groups and literals lists
  754. i = 0
  755. groups = []
  756. groupsappend = groups.append
  757. literals = [None] * len(p)
  758. for c, s in p:
  759. if c is MARK:
  760. groupsappend((i, s))
  761. # literal[i] is already None
  762. else:
  763. literals[i] = s
  764. i = i + 1
  765. return groups, literals
  766. def expand_template(template, match):
  767. g = match.group
  768. sep = match.string[:0]
  769. groups, literals = template
  770. literals = literals[:]
  771. try:
  772. for index, group in groups:
  773. literals[index] = s = g(group)
  774. if s is None:
  775. raise error, "unmatched group"
  776. except IndexError:
  777. raise error, "invalid group reference"
  778. return sep.join(literals)