# serializer.py
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. import re
  4. from codecs import register_error, xmlcharrefreplace_errors
  5. from .constants import voidElements, booleanAttributes, spaceCharacters
  6. from .constants import rcdataElements, entities, xmlEntities
  7. from . import treewalkers, _utils
  8. from xml.sax.saxutils import escape
  9. _quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
  10. _quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
  11. _quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
  12. "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
  13. "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
  14. "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
  15. "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
  16. "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
  17. "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
  18. "\u3000]")
  19. _encode_entity_map = {}
  20. _is_ucs4 = len("\U0010FFFF") == 1
  21. for k, v in list(entities.items()):
  22. # skip multi-character entities
  23. if ((_is_ucs4 and len(v) > 1) or
  24. (not _is_ucs4 and len(v) > 2)):
  25. continue
  26. if v != "&":
  27. if len(v) == 2:
  28. v = _utils.surrogatePairToCodepoint(v)
  29. else:
  30. v = ord(v)
  31. if v not in _encode_entity_map or k.islower():
  32. # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
  33. _encode_entity_map[v] = k
  34. def htmlentityreplace_errors(exc):
  35. if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
  36. res = []
  37. codepoints = []
  38. skip = False
  39. for i, c in enumerate(exc.object[exc.start:exc.end]):
  40. if skip:
  41. skip = False
  42. continue
  43. index = i + exc.start
  44. if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
  45. codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
  46. skip = True
  47. else:
  48. codepoint = ord(c)
  49. codepoints.append(codepoint)
  50. for cp in codepoints:
  51. e = _encode_entity_map.get(cp)
  52. if e:
  53. res.append("&")
  54. res.append(e)
  55. if not e.endswith(";"):
  56. res.append(";")
  57. else:
  58. res.append("&#x%s;" % (hex(cp)[2:]))
  59. return ("".join(res), exc.end)
  60. else:
  61. return xmlcharrefreplace_errors(exc)
  62. register_error("htmlentityreplace", htmlentityreplace_errors)
  63. def serialize(input, tree="etree", encoding=None, **serializer_opts):
  64. # XXX: Should we cache this?
  65. walker = treewalkers.getTreeWalker(tree)
  66. s = HTMLSerializer(**serializer_opts)
  67. return s.render(walker(input), encoding)
  68. class HTMLSerializer(object):
  69. # attribute quoting options
  70. quote_attr_values = "legacy" # be secure by default
  71. quote_char = '"'
  72. use_best_quote_char = True
  73. # tag syntax options
  74. omit_optional_tags = True
  75. minimize_boolean_attributes = True
  76. use_trailing_solidus = False
  77. space_before_trailing_solidus = True
  78. # escaping options
  79. escape_lt_in_attrs = False
  80. escape_rcdata = False
  81. resolve_entities = True
  82. # miscellaneous options
  83. alphabetical_attributes = False
  84. inject_meta_charset = True
  85. strip_whitespace = False
  86. sanitize = False
  87. options = ("quote_attr_values", "quote_char", "use_best_quote_char",
  88. "omit_optional_tags", "minimize_boolean_attributes",
  89. "use_trailing_solidus", "space_before_trailing_solidus",
  90. "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
  91. "alphabetical_attributes", "inject_meta_charset",
  92. "strip_whitespace", "sanitize")
  93. def __init__(self, **kwargs):
  94. """Initialize HTMLSerializer.
  95. Keyword options (default given first unless specified) include:
  96. inject_meta_charset=True|False
  97. Whether it insert a meta element to define the character set of the
  98. document.
  99. quote_attr_values="legacy"|"spec"|"always"
  100. Whether to quote attribute values that don't require quoting
  101. per legacy browser behaviour, when required by the standard, or always.
  102. quote_char=u'"'|u"'"
  103. Use given quote character for attribute quoting. Default is to
  104. use double quote unless attribute value contains a double quote,
  105. in which case single quotes are used instead.
  106. escape_lt_in_attrs=False|True
  107. Whether to escape < in attribute values.
  108. escape_rcdata=False|True
  109. Whether to escape characters that need to be escaped within normal
  110. elements within rcdata elements such as style.
  111. resolve_entities=True|False
  112. Whether to resolve named character entities that appear in the
  113. source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
  114. are unaffected by this setting.
  115. strip_whitespace=False|True
  116. Whether to remove semantically meaningless whitespace. (This
  117. compresses all whitespace to a single space except within pre.)
  118. minimize_boolean_attributes=True|False
  119. Shortens boolean attributes to give just the attribute value,
  120. for example <input disabled="disabled"> becomes <input disabled>.
  121. use_trailing_solidus=False|True
  122. Includes a close-tag slash at the end of the start tag of void
  123. elements (empty elements whose end tag is forbidden). E.g. <hr/>.
  124. space_before_trailing_solidus=True|False
  125. Places a space immediately before the closing slash in a tag
  126. using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
  127. sanitize=False|True
  128. Strip all unsafe or unknown constructs from output.
  129. See `html5lib user documentation`_
  130. omit_optional_tags=True|False
  131. Omit start/end tags that are optional.
  132. alphabetical_attributes=False|True
  133. Reorder attributes to be in alphabetical order.
  134. .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
  135. """
  136. unexpected_args = frozenset(kwargs) - frozenset(self.options)
  137. if len(unexpected_args) > 0:
  138. raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
  139. if 'quote_char' in kwargs:
  140. self.use_best_quote_char = False
  141. for attr in self.options:
  142. setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
  143. self.errors = []
  144. self.strict = False
  145. def encode(self, string):
  146. assert(isinstance(string, text_type))
  147. if self.encoding:
  148. return string.encode(self.encoding, "htmlentityreplace")
  149. else:
  150. return string
  151. def encodeStrict(self, string):
  152. assert(isinstance(string, text_type))
  153. if self.encoding:
  154. return string.encode(self.encoding, "strict")
  155. else:
  156. return string
  157. def serialize(self, treewalker, encoding=None):
  158. # pylint:disable=too-many-nested-blocks
  159. self.encoding = encoding
  160. in_cdata = False
  161. self.errors = []
  162. if encoding and self.inject_meta_charset:
  163. from .filters.inject_meta_charset import Filter
  164. treewalker = Filter(treewalker, encoding)
  165. # Alphabetical attributes is here under the assumption that none of
  166. # the later filters add or change order of attributes; it needs to be
  167. # before the sanitizer so escaped elements come out correctly
  168. if self.alphabetical_attributes:
  169. from .filters.alphabeticalattributes import Filter
  170. treewalker = Filter(treewalker)
  171. # WhitespaceFilter should be used before OptionalTagFilter
  172. # for maximum efficiently of this latter filter
  173. if self.strip_whitespace:
  174. from .filters.whitespace import Filter
  175. treewalker = Filter(treewalker)
  176. if self.sanitize:
  177. from .filters.sanitizer import Filter
  178. treewalker = Filter(treewalker)
  179. if self.omit_optional_tags:
  180. from .filters.optionaltags import Filter
  181. treewalker = Filter(treewalker)
  182. for token in treewalker:
  183. type = token["type"]
  184. if type == "Doctype":
  185. doctype = "<!DOCTYPE %s" % token["name"]
  186. if token["publicId"]:
  187. doctype += ' PUBLIC "%s"' % token["publicId"]
  188. elif token["systemId"]:
  189. doctype += " SYSTEM"
  190. if token["systemId"]:
  191. if token["systemId"].find('"') >= 0:
  192. if token["systemId"].find("'") >= 0:
  193. self.serializeError("System identifer contains both single and double quote characters")
  194. quote_char = "'"
  195. else:
  196. quote_char = '"'
  197. doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
  198. doctype += ">"
  199. yield self.encodeStrict(doctype)
  200. elif type in ("Characters", "SpaceCharacters"):
  201. if type == "SpaceCharacters" or in_cdata:
  202. if in_cdata and token["data"].find("</") >= 0:
  203. self.serializeError("Unexpected </ in CDATA")
  204. yield self.encode(token["data"])
  205. else:
  206. yield self.encode(escape(token["data"]))
  207. elif type in ("StartTag", "EmptyTag"):
  208. name = token["name"]
  209. yield self.encodeStrict("<%s" % name)
  210. if name in rcdataElements and not self.escape_rcdata:
  211. in_cdata = True
  212. elif in_cdata:
  213. self.serializeError("Unexpected child element of a CDATA element")
  214. for (_, attr_name), attr_value in token["data"].items():
  215. # TODO: Add namespace support here
  216. k = attr_name
  217. v = attr_value
  218. yield self.encodeStrict(' ')
  219. yield self.encodeStrict(k)
  220. if not self.minimize_boolean_attributes or \
  221. (k not in booleanAttributes.get(name, tuple()) and
  222. k not in booleanAttributes.get("", tuple())):
  223. yield self.encodeStrict("=")
  224. if self.quote_attr_values == "always" or len(v) == 0:
  225. quote_attr = True
  226. elif self.quote_attr_values == "spec":
  227. quote_attr = _quoteAttributeSpec.search(v) is not None
  228. elif self.quote_attr_values == "legacy":
  229. quote_attr = _quoteAttributeLegacy.search(v) is not None
  230. else:
  231. raise ValueError("quote_attr_values must be one of: "
  232. "'always', 'spec', or 'legacy'")
  233. v = v.replace("&", "&amp;")
  234. if self.escape_lt_in_attrs:
  235. v = v.replace("<", "&lt;")
  236. if quote_attr:
  237. quote_char = self.quote_char
  238. if self.use_best_quote_char:
  239. if "'" in v and '"' not in v:
  240. quote_char = '"'
  241. elif '"' in v and "'" not in v:
  242. quote_char = "'"
  243. if quote_char == "'":
  244. v = v.replace("'", "&#39;")
  245. else:
  246. v = v.replace('"', "&quot;")
  247. yield self.encodeStrict(quote_char)
  248. yield self.encode(v)
  249. yield self.encodeStrict(quote_char)
  250. else:
  251. yield self.encode(v)
  252. if name in voidElements and self.use_trailing_solidus:
  253. if self.space_before_trailing_solidus:
  254. yield self.encodeStrict(" /")
  255. else:
  256. yield self.encodeStrict("/")
  257. yield self.encode(">")
  258. elif type == "EndTag":
  259. name = token["name"]
  260. if name in rcdataElements:
  261. in_cdata = False
  262. elif in_cdata:
  263. self.serializeError("Unexpected child element of a CDATA element")
  264. yield self.encodeStrict("</%s>" % name)
  265. elif type == "Comment":
  266. data = token["data"]
  267. if data.find("--") >= 0:
  268. self.serializeError("Comment contains --")
  269. yield self.encodeStrict("<!--%s-->" % token["data"])
  270. elif type == "Entity":
  271. name = token["name"]
  272. key = name + ";"
  273. if key not in entities:
  274. self.serializeError("Entity %s not recognized" % name)
  275. if self.resolve_entities and key not in xmlEntities:
  276. data = entities[key]
  277. else:
  278. data = "&%s;" % name
  279. yield self.encodeStrict(data)
  280. else:
  281. self.serializeError(token["data"])
  282. def render(self, treewalker, encoding=None):
  283. if encoding:
  284. return b"".join(list(self.serialize(treewalker, encoding)))
  285. else:
  286. return "".join(list(self.serialize(treewalker)))
  287. def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
  288. # XXX The idea is to make data mandatory.
  289. self.errors.append(data)
  290. if self.strict:
  291. raise SerializeError
  292. class SerializeError(Exception):
  293. """Error in serialized tree"""
  294. pass