sre_compile.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Secret Labs' Regular Expression Engine
  4. #
  5. # convert template to internal format
  6. #
  7. # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
  8. #
  9. # See the sre.py file for information on usage and redistribution.
  10. #
  11. """Internal support module for sre"""
  12. import _sre, sys
  13. import sre_parse
  14. from sre_constants import *
  15. assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  16. if _sre.CODESIZE == 2:
  17. MAXCODE = 65535
  18. else:
  19. MAXCODE = 0xFFFFFFFFL
  20. _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
  21. _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
  22. _SUCCESS_CODES = set([SUCCESS, FAILURE])
  23. _ASSERT_CODES = set([ASSERT, ASSERT_NOT])
  24. # Sets of lowercase characters which have the same uppercase.
  25. _equivalences = (
  26. # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
  27. (0x69, 0x131), # iı
  28. # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
  29. (0x73, 0x17f), # sſ
  30. # MICRO SIGN, GREEK SMALL LETTER MU
  31. (0xb5, 0x3bc), # µμ
  32. # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
  33. (0x345, 0x3b9, 0x1fbe), # \u0345ιι
  34. # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
  35. (0x3b2, 0x3d0), # βϐ
  36. # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
  37. (0x3b5, 0x3f5), # εϵ
  38. # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
  39. (0x3b8, 0x3d1), # θϑ
  40. # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
  41. (0x3ba, 0x3f0), # κϰ
  42. # GREEK SMALL LETTER PI, GREEK PI SYMBOL
  43. (0x3c0, 0x3d6), # πϖ
  44. # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
  45. (0x3c1, 0x3f1), # ρϱ
  46. # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
  47. (0x3c2, 0x3c3), # ςσ
  48. # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
  49. (0x3c6, 0x3d5), # φϕ
  50. # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
  51. (0x1e61, 0x1e9b), # ṡẛ
  52. )
  53. # Maps the lowercase code to lowercase codes which have the same uppercase.
  54. _ignorecase_fixes = {i: tuple(j for j in t if i != j)
  55. for t in _equivalences for i in t}
  56. def _compile(code, pattern, flags):
  57. # internal: compile a (sub)pattern
  58. emit = code.append
  59. _len = len
  60. LITERAL_CODES = _LITERAL_CODES
  61. REPEATING_CODES = _REPEATING_CODES
  62. SUCCESS_CODES = _SUCCESS_CODES
  63. ASSERT_CODES = _ASSERT_CODES
  64. if (flags & SRE_FLAG_IGNORECASE and
  65. not (flags & SRE_FLAG_LOCALE) and
  66. flags & SRE_FLAG_UNICODE):
  67. fixes = _ignorecase_fixes
  68. else:
  69. fixes = None
  70. for op, av in pattern:
  71. if op in LITERAL_CODES:
  72. if flags & SRE_FLAG_IGNORECASE:
  73. lo = _sre.getlower(av, flags)
  74. if fixes and lo in fixes:
  75. emit(OPCODES[IN_IGNORE])
  76. skip = _len(code); emit(0)
  77. if op is NOT_LITERAL:
  78. emit(OPCODES[NEGATE])
  79. for k in (lo,) + fixes[lo]:
  80. emit(OPCODES[LITERAL])
  81. emit(k)
  82. emit(OPCODES[FAILURE])
  83. code[skip] = _len(code) - skip
  84. else:
  85. emit(OPCODES[OP_IGNORE[op]])
  86. emit(lo)
  87. else:
  88. emit(OPCODES[op])
  89. emit(av)
  90. elif op is IN:
  91. if flags & SRE_FLAG_IGNORECASE:
  92. emit(OPCODES[OP_IGNORE[op]])
  93. def fixup(literal, flags=flags):
  94. return _sre.getlower(literal, flags)
  95. else:
  96. emit(OPCODES[op])
  97. fixup = None
  98. skip = _len(code); emit(0)
  99. _compile_charset(av, flags, code, fixup, fixes)
  100. code[skip] = _len(code) - skip
  101. elif op is ANY:
  102. if flags & SRE_FLAG_DOTALL:
  103. emit(OPCODES[ANY_ALL])
  104. else:
  105. emit(OPCODES[ANY])
  106. elif op in REPEATING_CODES:
  107. if flags & SRE_FLAG_TEMPLATE:
  108. raise error, "internal: unsupported template operator"
  109. emit(OPCODES[REPEAT])
  110. skip = _len(code); emit(0)
  111. emit(av[0])
  112. emit(av[1])
  113. _compile(code, av[2], flags)
  114. emit(OPCODES[SUCCESS])
  115. code[skip] = _len(code) - skip
  116. elif _simple(av) and op is not REPEAT:
  117. if op is MAX_REPEAT:
  118. emit(OPCODES[REPEAT_ONE])
  119. else:
  120. emit(OPCODES[MIN_REPEAT_ONE])
  121. skip = _len(code); emit(0)
  122. emit(av[0])
  123. emit(av[1])
  124. _compile(code, av[2], flags)
  125. emit(OPCODES[SUCCESS])
  126. code[skip] = _len(code) - skip
  127. else:
  128. emit(OPCODES[REPEAT])
  129. skip = _len(code); emit(0)
  130. emit(av[0])
  131. emit(av[1])
  132. _compile(code, av[2], flags)
  133. code[skip] = _len(code) - skip
  134. if op is MAX_REPEAT:
  135. emit(OPCODES[MAX_UNTIL])
  136. else:
  137. emit(OPCODES[MIN_UNTIL])
  138. elif op is SUBPATTERN:
  139. if av[0]:
  140. emit(OPCODES[MARK])
  141. emit((av[0]-1)*2)
  142. # _compile_info(code, av[1], flags)
  143. _compile(code, av[1], flags)
  144. if av[0]:
  145. emit(OPCODES[MARK])
  146. emit((av[0]-1)*2+1)
  147. elif op in SUCCESS_CODES:
  148. emit(OPCODES[op])
  149. elif op in ASSERT_CODES:
  150. emit(OPCODES[op])
  151. skip = _len(code); emit(0)
  152. if av[0] >= 0:
  153. emit(0) # look ahead
  154. else:
  155. lo, hi = av[1].getwidth()
  156. if lo != hi:
  157. raise error, "look-behind requires fixed-width pattern"
  158. emit(lo) # look behind
  159. _compile(code, av[1], flags)
  160. emit(OPCODES[SUCCESS])
  161. code[skip] = _len(code) - skip
  162. elif op is CALL:
  163. emit(OPCODES[op])
  164. skip = _len(code); emit(0)
  165. _compile(code, av, flags)
  166. emit(OPCODES[SUCCESS])
  167. code[skip] = _len(code) - skip
  168. elif op is AT:
  169. emit(OPCODES[op])
  170. if flags & SRE_FLAG_MULTILINE:
  171. av = AT_MULTILINE.get(av, av)
  172. if flags & SRE_FLAG_LOCALE:
  173. av = AT_LOCALE.get(av, av)
  174. elif flags & SRE_FLAG_UNICODE:
  175. av = AT_UNICODE.get(av, av)
  176. emit(ATCODES[av])
  177. elif op is BRANCH:
  178. emit(OPCODES[op])
  179. tail = []
  180. tailappend = tail.append
  181. for av in av[1]:
  182. skip = _len(code); emit(0)
  183. # _compile_info(code, av, flags)
  184. _compile(code, av, flags)
  185. emit(OPCODES[JUMP])
  186. tailappend(_len(code)); emit(0)
  187. code[skip] = _len(code) - skip
  188. emit(0) # end of branch
  189. for tail in tail:
  190. code[tail] = _len(code) - tail
  191. elif op is CATEGORY:
  192. emit(OPCODES[op])
  193. if flags & SRE_FLAG_LOCALE:
  194. av = CH_LOCALE[av]
  195. elif flags & SRE_FLAG_UNICODE:
  196. av = CH_UNICODE[av]
  197. emit(CHCODES[av])
  198. elif op is GROUPREF:
  199. if flags & SRE_FLAG_IGNORECASE:
  200. emit(OPCODES[OP_IGNORE[op]])
  201. else:
  202. emit(OPCODES[op])
  203. emit(av-1)
  204. elif op is GROUPREF_EXISTS:
  205. emit(OPCODES[op])
  206. emit(av[0]-1)
  207. skipyes = _len(code); emit(0)
  208. _compile(code, av[1], flags)
  209. if av[2]:
  210. emit(OPCODES[JUMP])
  211. skipno = _len(code); emit(0)
  212. code[skipyes] = _len(code) - skipyes + 1
  213. _compile(code, av[2], flags)
  214. code[skipno] = _len(code) - skipno
  215. else:
  216. code[skipyes] = _len(code) - skipyes + 1
  217. else:
  218. raise ValueError, ("unsupported operand type", op)
  219. def _compile_charset(charset, flags, code, fixup=None, fixes=None):
  220. # compile charset subprogram
  221. emit = code.append
  222. for op, av in _optimize_charset(charset, fixup, fixes,
  223. flags & SRE_FLAG_UNICODE):
  224. emit(OPCODES[op])
  225. if op is NEGATE:
  226. pass
  227. elif op is LITERAL:
  228. emit(av)
  229. elif op is RANGE:
  230. emit(av[0])
  231. emit(av[1])
  232. elif op is CHARSET:
  233. code.extend(av)
  234. elif op is BIGCHARSET:
  235. code.extend(av)
  236. elif op is CATEGORY:
  237. if flags & SRE_FLAG_LOCALE:
  238. emit(CHCODES[CH_LOCALE[av]])
  239. elif flags & SRE_FLAG_UNICODE:
  240. emit(CHCODES[CH_UNICODE[av]])
  241. else:
  242. emit(CHCODES[av])
  243. else:
  244. raise error, "internal: unsupported set operator"
  245. emit(OPCODES[FAILURE])
  246. def _optimize_charset(charset, fixup, fixes, isunicode):
  247. # internal: optimize character set
  248. out = []
  249. tail = []
  250. charmap = bytearray(256)
  251. for op, av in charset:
  252. while True:
  253. try:
  254. if op is LITERAL:
  255. if fixup:
  256. i = fixup(av)
  257. charmap[i] = 1
  258. if fixes and i in fixes:
  259. for k in fixes[i]:
  260. charmap[k] = 1
  261. else:
  262. charmap[av] = 1
  263. elif op is RANGE:
  264. r = range(av[0], av[1]+1)
  265. if fixup:
  266. r = map(fixup, r)
  267. if fixup and fixes:
  268. for i in r:
  269. charmap[i] = 1
  270. if i in fixes:
  271. for k in fixes[i]:
  272. charmap[k] = 1
  273. else:
  274. for i in r:
  275. charmap[i] = 1
  276. elif op is NEGATE:
  277. out.append((op, av))
  278. else:
  279. tail.append((op, av))
  280. except IndexError:
  281. if len(charmap) == 256:
  282. # character set contains non-UCS1 character codes
  283. charmap += b'\0' * 0xff00
  284. continue
  285. # character set contains non-BMP character codes
  286. if fixup and isunicode and op is RANGE:
  287. lo, hi = av
  288. ranges = [av]
  289. # There are only two ranges of cased astral characters:
  290. # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
  291. _fixup_range(max(0x10000, lo), min(0x11fff, hi),
  292. ranges, fixup)
  293. for lo, hi in ranges:
  294. if lo == hi:
  295. tail.append((LITERAL, hi))
  296. else:
  297. tail.append((RANGE, (lo, hi)))
  298. else:
  299. tail.append((op, av))
  300. break
  301. # compress character map
  302. runs = []
  303. q = 0
  304. while True:
  305. p = charmap.find(b'\1', q)
  306. if p < 0:
  307. break
  308. if len(runs) >= 2:
  309. runs = None
  310. break
  311. q = charmap.find(b'\0', p)
  312. if q < 0:
  313. runs.append((p, len(charmap)))
  314. break
  315. runs.append((p, q))
  316. if runs is not None:
  317. # use literal/range
  318. for p, q in runs:
  319. if q - p == 1:
  320. out.append((LITERAL, p))
  321. else:
  322. out.append((RANGE, (p, q - 1)))
  323. out += tail
  324. # if the case was changed or new representation is more compact
  325. if fixup or len(out) < len(charset):
  326. return out
  327. # else original character set is good enough
  328. return charset
  329. # use bitmap
  330. if len(charmap) == 256:
  331. data = _mk_bitmap(charmap)
  332. out.append((CHARSET, data))
  333. out += tail
  334. return out
  335. # To represent a big charset, first a bitmap of all characters in the
  336. # set is constructed. Then, this bitmap is sliced into chunks of 256
  337. # characters, duplicate chunks are eliminated, and each chunk is
  338. # given a number. In the compiled expression, the charset is
  339. # represented by a 32-bit word sequence, consisting of one word for
  340. # the number of different chunks, a sequence of 256 bytes (64 words)
  341. # of chunk numbers indexed by their original chunk position, and a
  342. # sequence of 256-bit chunks (8 words each).
  343. # Compression is normally good: in a typical charset, large ranges of
  344. # Unicode will be either completely excluded (e.g. if only cyrillic
  345. # letters are to be matched), or completely included (e.g. if large
  346. # subranges of Kanji match). These ranges will be represented by
  347. # chunks of all one-bits or all zero-bits.
  348. # Matching can be also done efficiently: the more significant byte of
  349. # the Unicode character is an index into the chunk number, and the
  350. # less significant byte is a bit index in the chunk (just like the
  351. # CHARSET matching).
  352. # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
  353. # of the basic multilingual plane; an efficient representation
  354. # for all of Unicode has not yet been developed.
  355. charmap = bytes(charmap) # should be hashable
  356. comps = {}
  357. mapping = bytearray(256)
  358. block = 0
  359. data = bytearray()
  360. for i in range(0, 65536, 256):
  361. chunk = charmap[i: i + 256]
  362. if chunk in comps:
  363. mapping[i // 256] = comps[chunk]
  364. else:
  365. mapping[i // 256] = comps[chunk] = block
  366. block += 1
  367. data += chunk
  368. data = _mk_bitmap(data)
  369. data[0:0] = [block] + _bytes_to_codes(mapping)
  370. out.append((BIGCHARSET, data))
  371. out += tail
  372. return out
  373. def _fixup_range(lo, hi, ranges, fixup):
  374. for i in map(fixup, range(lo, hi+1)):
  375. for k, (lo, hi) in enumerate(ranges):
  376. if i < lo:
  377. if l == lo - 1:
  378. ranges[k] = (i, hi)
  379. else:
  380. ranges.insert(k, (i, i))
  381. break
  382. elif i > hi:
  383. if i == hi + 1:
  384. ranges[k] = (lo, i)
  385. break
  386. else:
  387. break
  388. else:
  389. ranges.append((i, i))
  390. _CODEBITS = _sre.CODESIZE * 8
  391. _BITS_TRANS = b'0' + b'1' * 255
  392. def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
  393. s = bytes(bits).translate(_BITS_TRANS)[::-1]
  394. return [_int(s[i - _CODEBITS: i], 2)
  395. for i in range(len(s), 0, -_CODEBITS)]
  396. def _bytes_to_codes(b):
  397. # Convert block indices to word array
  398. import array
  399. if _sre.CODESIZE == 2:
  400. code = 'H'
  401. else:
  402. code = 'I'
  403. a = array.array(code, bytes(b))
  404. assert a.itemsize == _sre.CODESIZE
  405. assert len(a) * a.itemsize == len(b)
  406. return a.tolist()
  407. def _simple(av):
  408. # check if av is a "simple" operator
  409. lo, hi = av[2].getwidth()
  410. return lo == hi == 1 and av[2][0][0] != SUBPATTERN
  411. def _compile_info(code, pattern, flags):
  412. # internal: compile an info block. in the current version,
  413. # this contains min/max pattern width, and an optional literal
  414. # prefix or a character map
  415. lo, hi = pattern.getwidth()
  416. if lo == 0:
  417. return # not worth it
  418. # look for a literal prefix
  419. prefix = []
  420. prefixappend = prefix.append
  421. prefix_skip = 0
  422. charset = [] # not used
  423. charsetappend = charset.append
  424. if not (flags & SRE_FLAG_IGNORECASE):
  425. # look for literal prefix
  426. for op, av in pattern.data:
  427. if op is LITERAL:
  428. if len(prefix) == prefix_skip:
  429. prefix_skip = prefix_skip + 1
  430. prefixappend(av)
  431. elif op is SUBPATTERN and len(av[1]) == 1:
  432. op, av = av[1][0]
  433. if op is LITERAL:
  434. prefixappend(av)
  435. else:
  436. break
  437. else:
  438. break
  439. # if no prefix, look for charset prefix
  440. if not prefix and pattern.data:
  441. op, av = pattern.data[0]
  442. if op is SUBPATTERN and av[1]:
  443. op, av = av[1][0]
  444. if op is LITERAL:
  445. charsetappend((op, av))
  446. elif op is BRANCH:
  447. c = []
  448. cappend = c.append
  449. for p in av[1]:
  450. if not p:
  451. break
  452. op, av = p[0]
  453. if op is LITERAL:
  454. cappend((op, av))
  455. else:
  456. break
  457. else:
  458. charset = c
  459. elif op is BRANCH:
  460. c = []
  461. cappend = c.append
  462. for p in av[1]:
  463. if not p:
  464. break
  465. op, av = p[0]
  466. if op is LITERAL:
  467. cappend((op, av))
  468. else:
  469. break
  470. else:
  471. charset = c
  472. elif op is IN:
  473. charset = av
  474. ## if prefix:
  475. ## print "*** PREFIX", prefix, prefix_skip
  476. ## if charset:
  477. ## print "*** CHARSET", charset
  478. # add an info block
  479. emit = code.append
  480. emit(OPCODES[INFO])
  481. skip = len(code); emit(0)
  482. # literal flag
  483. mask = 0
  484. if prefix:
  485. mask = SRE_INFO_PREFIX
  486. if len(prefix) == prefix_skip == len(pattern.data):
  487. mask = mask + SRE_INFO_LITERAL
  488. elif charset:
  489. mask = mask + SRE_INFO_CHARSET
  490. emit(mask)
  491. # pattern length
  492. if lo < MAXCODE:
  493. emit(lo)
  494. else:
  495. emit(MAXCODE)
  496. prefix = prefix[:MAXCODE]
  497. if hi < MAXCODE:
  498. emit(hi)
  499. else:
  500. emit(0)
  501. # add literal prefix
  502. if prefix:
  503. emit(len(prefix)) # length
  504. emit(prefix_skip) # skip
  505. code.extend(prefix)
  506. # generate overlap table
  507. table = [-1] + ([0]*len(prefix))
  508. for i in xrange(len(prefix)):
  509. table[i+1] = table[i]+1
  510. while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
  511. table[i+1] = table[table[i+1]-1]+1
  512. code.extend(table[1:]) # don't store first entry
  513. elif charset:
  514. _compile_charset(charset, flags, code)
  515. code[skip] = len(code) - skip
  516. try:
  517. unicode
  518. except NameError:
  519. STRING_TYPES = (type(""),)
  520. else:
  521. STRING_TYPES = (type(""), type(unicode("")))
  522. def isstring(obj):
  523. for tp in STRING_TYPES:
  524. if isinstance(obj, tp):
  525. return 1
  526. return 0
  527. def _code(p, flags):
  528. flags = p.pattern.flags | flags
  529. code = []
  530. # compile info block
  531. _compile_info(code, p, flags)
  532. # compile the pattern
  533. _compile(code, p.data, flags)
  534. code.append(OPCODES[SUCCESS])
  535. return code
  536. def compile(p, flags=0):
  537. # internal: convert pattern list to internal format
  538. if isstring(p):
  539. pattern = p
  540. p = sre_parse.parse(p, flags)
  541. else:
  542. pattern = None
  543. code = _code(p, flags)
  544. # print code
  545. # XXX: <fl> get rid of this limitation!
  546. if p.pattern.groups > 100:
  547. raise AssertionError(
  548. "sorry, but this version only supports 100 named groups"
  549. )
  550. # map in either direction
  551. groupindex = p.pattern.groupdict
  552. indexgroup = [None] * p.pattern.groups
  553. for k, i in groupindex.items():
  554. indexgroup[i] = k
  555. return _sre.compile(
  556. pattern, flags | p.pattern.flags, code,
  557. p.pattern.groups-1,
  558. groupindex, indexgroup
  559. )