html5parser.py 117 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import with_metaclass, viewkeys, PY3
  3. import types
  4. try:
  5. from collections import OrderedDict
  6. except ImportError:
  7. from pip._vendor.ordereddict import OrderedDict
  8. from . import _inputstream
  9. from . import _tokenizer
  10. from . import treebuilders
  11. from .treebuilders.base import Marker
  12. from . import _utils
  13. from .constants import (
  14. spaceCharacters, asciiUpper2Lower,
  15. specialElements, headingElements, cdataElements, rcdataElements,
  16. tokenTypes, tagTokenTypes,
  17. namespaces,
  18. htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
  19. adjustForeignAttributes as adjustForeignAttributesMap,
  20. adjustMathMLAttributes, adjustSVGAttributes,
  21. E,
  22. ReparseException
  23. )
  def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
      """Parse a string or file-like object into a tree.

      ``treebuilder`` is handed to ``treebuilders.getTreeBuilder`` to pick the
      tree implementation, ``namespaceHTMLElements`` is forwarded to the
      ``HTMLParser`` constructor, and any remaining keyword arguments are
      forwarded to ``HTMLParser.parse``.
      """
      builder = treebuilders.getTreeBuilder(treebuilder)
      parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
      document = parser.parse(doc, **kwargs)
      return document
  def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
      """Parse a string or file-like object into a tree fragment.

      ``container`` is forwarded to ``HTMLParser.parseFragment`` as the name
      of the element whose content is being parsed. ``treebuilder`` is handed
      to ``treebuilders.getTreeBuilder``; remaining keyword arguments are
      forwarded to ``HTMLParser.parseFragment`` unchanged.
      """
      builder = treebuilders.getTreeBuilder(treebuilder)
      parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
      fragment = parser.parseFragment(doc, container=container, **kwargs)
      return fragment
  def method_decorator_metaclass(function):
      """Build a metaclass that applies ``function`` to every plain function
      found in the namespace of each class it creates.

      Only attributes that are ``types.FunctionType`` instances are wrapped;
      every other attribute is left as it is.
      """
      class Decorated(type):
          def __new__(meta, classname, bases, classDict):
              # Snapshot the keys first; values are replaced in place below.
              for key in list(classDict):
                  member = classDict[key]
                  if isinstance(member, types.FunctionType):
                      classDict[key] = function(member)
              return type.__new__(meta, classname, bases, classDict)

      return Decorated
  class HTMLParser(object):
      """HTML parser. Generates a tree structure from a stream of (possibly
      malformed) HTML"""

      def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
          """
          strict - raise an exception when a parse error is encountered

          tree - a treebuilder class controlling the type of tree that will be
          returned. Built in treebuilders can be accessed through
          html5lib.treebuilders.getTreeBuilder(treeType)

          namespaceHTMLElements - forwarded as the only argument when the
          treebuilder class is instantiated

          debug - forwarded to getPhases() when the phase objects are built
          """

          # Raise an exception on the first error encountered
          self.strict = strict

          if tree is None:
              tree = treebuilders.getTreeBuilder("etree")
          self.tree = tree(namespaceHTMLElements)
          self.errors = []

          # One instance of every phase class, keyed by phase name. A generator
          # is enough here; no intermediate list is needed.
          self.phases = dict((name, cls(self, self.tree))
                             for name, cls in getPhases(debug).items())

      def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
          """Tokenize ``stream`` and run the main loop over the tokens.

          Extra keyword arguments are forwarded to the tokenizer. If the main
          loop raises ReparseException the parser is reset and the loop is run
          a second time.
          """
          self.innerHTMLMode = innerHTML
          self.container = container
          self.scripting = scripting
          self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
          self.reset()

          try:
              self.mainLoop()
          except ReparseException:
              self.reset()
              self.mainLoop()

      def reset(self):
          """Reset the tree and all per-parse state, then select the first phase.

          In innerHTML (fragment) mode the tokenizer state is chosen from the
          container name, an html root is inserted and the insertion mode is
          derived from the container; otherwise parsing starts in the
          "initial" phase.
          """
          self.tree.reset()
          self.firstStartTag = False
          self.errors = []
          self.log = []  # only used with debug mode
          # "quirks" / "limited quirks" / "no quirks"
          self.compatMode = "no quirks"

          if self.innerHTMLMode:
              self.innerHTML = self.container.lower()

              if self.innerHTML in cdataElements:
                  self.tokenizer.state = self.tokenizer.rcdataState
              elif self.innerHTML in rcdataElements:
                  self.tokenizer.state = self.tokenizer.rawtextState
              elif self.innerHTML == 'plaintext':
                  self.tokenizer.state = self.tokenizer.plaintextState
              else:
                  # state already is data state
                  # self.tokenizer.state = self.tokenizer.dataState
                  pass
              self.phase = self.phases["beforeHtml"]
              self.phase.insertHtmlElement()
              self.resetInsertionMode()
          else:
              self.innerHTML = False  # pylint:disable=redefined-variable-type
              self.phase = self.phases["initial"]

          self.lastPhase = None

          self.beforeRCDataPhase = None

          self.framesetOK = True

      @property
      def documentEncoding(self):
          """The name of the character encoding
          that was used to decode the input stream,
          or :obj:`None` if that is not determined yet.

          """
          if not hasattr(self, 'tokenizer'):
              return None
          return self.tokenizer.stream.charEncoding[0].name

      def isHTMLIntegrationPoint(self, element):
          """Return whether ``element`` is an HTML integration point.

          A MathML ``annotation-xml`` element qualifies only when its
          ``encoding`` attribute is, case-insensitively for ASCII, "text/html"
          or "application/xhtml+xml"; any other element is looked up in
          htmlIntegrationPointElements.
          """
          if (element.name == "annotation-xml" and
                  element.namespace == namespaces["mathml"]):
              return ("encoding" in element.attributes and
                      element.attributes["encoding"].translate(
                          asciiUpper2Lower) in
                      ("text/html", "application/xhtml+xml"))
          else:
              return (element.namespace, element.name) in htmlIntegrationPointElements

      def isMathMLTextIntegrationPoint(self, element):
          """Return whether ``element`` is listed in
          mathmlTextIntegrationPointElements."""
          return (element.namespace, element.name) in mathmlTextIntegrationPointElements

      def mainLoop(self):
          """Feed every token to the appropriate phase, then process EOF.

          A phase method may return a token, in which case that token is
          processed again (possibly by a different phase) until a phase
          returns None.
          """
          CharactersToken = tokenTypes["Characters"]
          SpaceCharactersToken = tokenTypes["SpaceCharacters"]
          StartTagToken = tokenTypes["StartTag"]
          EndTagToken = tokenTypes["EndTag"]
          CommentToken = tokenTypes["Comment"]
          DoctypeToken = tokenTypes["Doctype"]
          ParseErrorToken = tokenTypes["ParseError"]

          # Loop invariant: start tags that do NOT switch to the current
          # (HTML) phase at a MathML text integration point.
          mathmlForeignStartTags = frozenset(["mglyph", "malignmark"])

          for token in self.normalizedTokens():
              prev_token = None
              new_token = token
              while new_token is not None:
                  prev_token = new_token
                  currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                  currentNodeNamespace = currentNode.namespace if currentNode else None
                  currentNodeName = currentNode.name if currentNode else None

                  tokenType = new_token["type"]

                  if tokenType == ParseErrorToken:
                      self.parseError(new_token["data"], new_token.get("datavars", {}))
                      new_token = None
                  else:
                      # The tag name is read from new_token, the token actually
                      # being dispatched on; it is the same object as ``token``
                      # whenever a phase hands back the token it was given.
                      if (not self.tree.openElements or
                          currentNodeNamespace == self.tree.defaultNamespace or
                          (self.isMathMLTextIntegrationPoint(currentNode) and
                           ((tokenType == StartTagToken and
                             new_token["name"] not in mathmlForeignStartTags) or
                            tokenType in (CharactersToken, SpaceCharactersToken))) or
                          (currentNodeNamespace == namespaces["mathml"] and
                           currentNodeName == "annotation-xml" and
                           tokenType == StartTagToken and
                           new_token["name"] == "svg") or
                          (self.isHTMLIntegrationPoint(currentNode) and
                           tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                          phase = self.phase
                      else:
                          phase = self.phases["inForeignContent"]

                      if tokenType == CharactersToken:
                          new_token = phase.processCharacters(new_token)
                      elif tokenType == SpaceCharactersToken:
                          new_token = phase.processSpaceCharacters(new_token)
                      elif tokenType == StartTagToken:
                          new_token = phase.processStartTag(new_token)
                      elif tokenType == EndTagToken:
                          new_token = phase.processEndTag(new_token)
                      elif tokenType == CommentToken:
                          new_token = phase.processComment(new_token)
                      elif tokenType == DoctypeToken:
                          new_token = phase.processDoctype(new_token)

              # tokenType / prev_token describe the last token handled above.
              if (tokenType == StartTagToken and prev_token["selfClosing"] and
                      not prev_token["selfClosingAcknowledged"]):
                  self.parseError("non-void-element-with-trailing-solidus",
                                  {"name": prev_token["name"]})

          # When the loop finishes it's EOF
          reprocess = True
          phases = []
          while reprocess:
              phases.append(self.phase)
              reprocess = self.phase.processEOF()
              if reprocess:
                  # A phase asking for EOF to be reprocessed must have moved
                  # the parser on to a phase not seen before.
                  assert self.phase not in phases

      def normalizedTokens(self):
          """Yield each tokenizer token after passing it through
          normalizeToken()."""
          for token in self.tokenizer:
              yield self.normalizeToken(token)

      def parse(self, stream, *args, **kwargs):
          """Parse a HTML document into a well-formed tree

          stream - a filelike object or string containing the HTML to be parsed

          The optional encoding parameter must be a string that indicates
          the encoding.  If specified, that encoding will be used,
          regardless of any BOM or later declaration (such as in a meta
          element)

          scripting - treat noscript elements as if javascript was turned on
          """
          self._parse(stream, False, None, *args, **kwargs)
          return self.tree.getDocument()

      def parseFragment(self, stream, *args, **kwargs):
          """Parse a HTML fragment into a well-formed tree fragment

          container - name of the element we're setting the innerHTML property
          if set to None, default to 'div'

          stream - a filelike object or string containing the HTML to be parsed

          The optional encoding parameter must be a string that indicates
          the encoding.  If specified, that encoding will be used,
          regardless of any BOM or later declaration (such as in a meta
          element)

          scripting - treat noscript elements as if javascript was turned on
          """
          self._parse(stream, True, *args, **kwargs)
          return self.tree.getFragment()

      def parseError(self, errorcode="XXX-undefined-error", datavars=None):
          """Record a parse error as ``(position, errorcode, datavars)`` in
          ``self.errors``; in strict mode also raise ParseError with the
          formatted message for ``errorcode``."""
          # XXX The idea is to make errorcode mandatory.
          if datavars is None:
              datavars = {}
          self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
          if self.strict:
              raise ParseError(E[errorcode] % datavars)

      def normalizeToken(self, token):
          """ HTML5 specific normalizations to the token stream """

          if token["type"] == tokenTypes["StartTag"]:
              raw = token["data"]
              token["data"] = OrderedDict(raw)
              if len(raw) > len(token["data"]):
                  # we had some duplicated attribute, fix so first wins
                  token["data"].update(raw[::-1])

          return token

      def adjustMathMLAttributes(self, token):
          """Apply the adjustMathMLAttributes table to ``token`` via
          adjust_attributes()."""
          adjust_attributes(token, adjustMathMLAttributes)

      def adjustSVGAttributes(self, token):
          """Apply the adjustSVGAttributes table to ``token`` via
          adjust_attributes()."""
          adjust_attributes(token, adjustSVGAttributes)

      def adjustForeignAttributes(self, token):
          """Apply the adjustForeignAttributes table to ``token`` via
          adjust_attributes()."""
          adjust_attributes(token, adjustForeignAttributesMap)

      def reparseTokenNormal(self, token):
          # pylint:disable=unused-argument
          # NOTE(review): nothing in this class assigns ``self.parser``, so
          # this call looks like it would raise AttributeError if it were ever
          # reached -- presumably dead code; confirm before relying on it.
          self.parser.phase()

      def resetInsertionMode(self):
          """Set ``self.phase`` from the stack of open elements, scanning from
          the most recently opened element towards the root."""
          # The name of this method is mostly historical. (It's also used in the
          # specification.)
          last = False
          newModes = {
              "select": "inSelect",
              "td": "inCell",
              "th": "inCell",
              "tr": "inRow",
              "tbody": "inTableBody",
              "thead": "inTableBody",
              "tfoot": "inTableBody",
              "caption": "inCaption",
              "colgroup": "inColumnGroup",
              "table": "inTable",
              "head": "inBody",
              "body": "inBody",
              "frameset": "inFrameset",
              "html": "beforeHead"
          }
          for node in self.tree.openElements[::-1]:
              nodeName = node.name
              new_phase = None
              if node == self.tree.openElements[0]:
                  # Reaching the root is only expected when parsing a
                  # fragment; the container name stands in for the root.
                  assert self.innerHTML
                  last = True
                  nodeName = self.innerHTML
              # Check for conditions that should only happen in the innerHTML
              # case
              if nodeName in ("select", "colgroup", "head", "html"):
                  assert self.innerHTML

              if not last and node.namespace != self.tree.defaultNamespace:
                  continue

              if nodeName in newModes:
                  new_phase = self.phases[newModes[nodeName]]
                  break
              elif last:
                  new_phase = self.phases["inBody"]
                  break

          self.phase = new_phase

      def parseRCDataRawtext(self, token, contentType):
          """Generic RCDATA/RAWTEXT Parsing algorithm
          contentType - RCDATA or RAWTEXT
          """
          assert contentType in ("RAWTEXT", "RCDATA")

          self.tree.insertElement(token)

          if contentType == "RAWTEXT":
              self.tokenizer.state = self.tokenizer.rawtextState
          else:
              self.tokenizer.state = self.tokenizer.rcdataState

          # Remember the current phase in self.originalPhase before switching
          # to the "text" phase.
          self.originalPhase = self.phase

          self.phase = self.phases["text"]
  283. @_utils.memoize
  284. def getPhases(debug):
  def log(function):
      """Logger that records which phase processes each token

      Returns a wrapper around ``function``. When ``function`` is named
      ``process*`` and receives at least one positional argument, the wrapper
      appends a tuple (tokenizer state name, current parser phase class name,
      class name of the handling object, function name, token info) to
      ``self.parser.log`` before delegating. Every call is delegated to
      ``function`` unchanged.
      """
      type_names = dict((value, key) for key, value in
                        tokenTypes.items())

      def wrapped(self, *args, **kwargs):
          if function.__name__.startswith("process") and args:
              token = args[0]
              # An unknown token type raises KeyError here, exactly as the
              # former ``try: ... except: raise`` did.
              info = {"type": type_names[token['type']]}
              if token['type'] in tagTokenTypes:
                  info["name"] = token['name']

              self.parser.log.append((self.parser.tokenizer.state.__name__,
                                      self.parser.phase.__class__.__name__,
                                      self.__class__.__name__,
                                      function.__name__,
                                      info))
          return function(self, *args, **kwargs)

      return wrapped
  def getMetaclass(use_metaclass, metaclass_func):
      """Pick the metaclass for the phase classes.

      Returns ``method_decorator_metaclass(metaclass_func)`` when
      ``use_metaclass`` is truthy, and plain ``type`` otherwise.
      """
      if not use_metaclass:
          return type
      return method_decorator_metaclass(metaclass_func)
  312. # pylint:disable=unused-argument
  class Phase(with_metaclass(getMetaclass(debug, log))):
      """Base class for helper object that implements each phase of processing
      """

      def __init__(self, parser, tree):
          """Keep references to the owning parser and the tree being built."""
          self.tree = tree
          self.parser = parser

      def processEOF(self):
          """Handle end of input; every concrete phase must override this."""
          raise NotImplementedError

      def processComment(self, token):
          """Insert the comment as a child of the current open element."""
          # For most phases the following is correct. Where it's not it will be
          # overridden.
          currentNode = self.tree.openElements[-1]
          self.tree.insertComment(token, currentNode)

      def processDoctype(self, token):
          """Report the doctype token as a parse error."""
          self.parser.parseError("unexpected-doctype")

      def processCharacters(self, token):
          """Insert the token's character data into the tree."""
          text = token["data"]
          self.tree.insertText(text)

      def processSpaceCharacters(self, token):
          """Insert the token's whitespace data into the tree."""
          text = token["data"]
          self.tree.insertText(text)

      def processStartTag(self, token):
          """Dispatch on the tag name through ``self.startTagHandler``."""
          handler = self.startTagHandler[token["name"]]
          return handler(token)

      def startTagHtml(self, token):
          """Copy attributes from an ``html`` start tag onto the root element,
          never overwriting an attribute the root already has."""
          if token["name"] == "html" and not self.parser.firstStartTag:
              self.parser.parseError("non-html-root")
          # XXX Need a check here to see if the first start tag token emitted is
          # this token... If it's not, invoke self.parser.parseError().
          for attr, value in token["data"].items():
              if attr in self.tree.openElements[0].attributes:
                  continue
              self.tree.openElements[0].attributes[attr] = value
          self.parser.firstStartTag = False

      def processEndTag(self, token):
          """Dispatch on the tag name through ``self.endTagHandler``."""
          handler = self.endTagHandler[token["name"]]
          return handler(token)
  class InitialPhase(Phase):
      """Phase the parser starts in for whole documents: consumes the doctype
      (if any), sets the compatibility mode and hands over to "beforeHtml"."""

      def processSpaceCharacters(self, token):
          # Whitespace in this phase is dropped.
          pass

      def processComment(self, token):
          # Comments seen here are attached to the document itself.
          self.tree.insertComment(token, self.tree.document)

      def processDoctype(self, token):
          """Insert the doctype, derive ``parser.compatMode`` from its
          public/system identifiers and move on to the "beforeHtml" phase."""
          name = token["name"]
          publicId = token["publicId"]
          systemId = token["systemId"]
          correct = token["correct"]

          badSystemId = systemId is not None and systemId != "about:legacy-compat"
          if name != "html" or publicId is not None or badSystemId:
              self.parser.parseError("unknown-doctype")

          if publicId is None:
              publicId = ""

          self.tree.insertDoctype(token)

          if publicId != "":
              publicId = publicId.translate(asciiUpper2Lower)

          # Public identifier prefixes that force quirks mode.
          quirksPrefixes = (
              "+//silmaril//dtd html pro v0r11 19970101//",
              "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
              "-//as//dtd html 3.0 aswedit + extensions//",
              "-//ietf//dtd html 2.0 level 1//",
              "-//ietf//dtd html 2.0 level 2//",
              "-//ietf//dtd html 2.0 strict level 1//",
              "-//ietf//dtd html 2.0 strict level 2//",
              "-//ietf//dtd html 2.0 strict//",
              "-//ietf//dtd html 2.0//",
              "-//ietf//dtd html 2.1e//",
              "-//ietf//dtd html 3.0//",
              "-//ietf//dtd html 3.2 final//",
              "-//ietf//dtd html 3.2//",
              "-//ietf//dtd html 3//",
              "-//ietf//dtd html level 0//",
              "-//ietf//dtd html level 1//",
              "-//ietf//dtd html level 2//",
              "-//ietf//dtd html level 3//",
              "-//ietf//dtd html strict level 0//",
              "-//ietf//dtd html strict level 1//",
              "-//ietf//dtd html strict level 2//",
              "-//ietf//dtd html strict level 3//",
              "-//ietf//dtd html strict//",
              "-//ietf//dtd html//",
              "-//metrius//dtd metrius presentational//",
              "-//microsoft//dtd internet explorer 2.0 html strict//",
              "-//microsoft//dtd internet explorer 2.0 html//",
              "-//microsoft//dtd internet explorer 2.0 tables//",
              "-//microsoft//dtd internet explorer 3.0 html strict//",
              "-//microsoft//dtd internet explorer 3.0 html//",
              "-//microsoft//dtd internet explorer 3.0 tables//",
              "-//netscape comm. corp.//dtd html//",
              "-//netscape comm. corp.//dtd strict html//",
              "-//o'reilly and associates//dtd html 2.0//",
              "-//o'reilly and associates//dtd html extended 1.0//",
              "-//o'reilly and associates//dtd html extended relaxed 1.0//",
              "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
              "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
              "-//spyglass//dtd html 2.0 extended//",
              "-//sq//dtd html 2.0 hotmetal + extensions//",
              "-//sun microsystems corp.//dtd hotjava html//",
              "-//sun microsystems corp.//dtd hotjava strict html//",
              "-//w3c//dtd html 3 1995-03-24//",
              "-//w3c//dtd html 3.2 draft//",
              "-//w3c//dtd html 3.2 final//",
              "-//w3c//dtd html 3.2//",
              "-//w3c//dtd html 3.2s draft//",
              "-//w3c//dtd html 4.0 frameset//",
              "-//w3c//dtd html 4.0 transitional//",
              "-//w3c//dtd html experimental 19960712//",
              "-//w3c//dtd html experimental 970421//",
              "-//w3c//dtd w3 html//",
              "-//w3o//dtd w3 html 3.0//",
              "-//webtechs//dtd mozilla html 2.0//",
              "-//webtechs//dtd mozilla html//",
          )
          # Public identifiers that force quirks mode on an exact match.
          quirksExact = (
              "-//w3o//dtd w3 html strict 3.0//en//",
              "-/w3c/dtd html 4.0 transitional/en",
              "html",
          )
          # Quirks without a system identifier, limited quirks with one.
          html401Prefixes = (
              "-//w3c//dtd html 4.01 frameset//",
              "-//w3c//dtd html 4.01 transitional//",
          )
          limitedQuirksPrefixes = (
              "-//w3c//dtd xhtml 1.0 frameset//",
              "-//w3c//dtd xhtml 1.0 transitional//",
          )

          # Each test is evaluated lazily and in the original order.
          if not correct or token["name"] != "html":
              quirks = True
          elif publicId.startswith(quirksPrefixes) or publicId in quirksExact:
              quirks = True
          elif publicId.startswith(html401Prefixes) and systemId is None:
              quirks = True
          else:
              quirks = (systemId and
                        systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")

          if quirks:
              self.parser.compatMode = "quirks"
          elif (publicId.startswith(limitedQuirksPrefixes) or
                (publicId.startswith(html401Prefixes) and systemId is not None)):
              self.parser.compatMode = "limited quirks"

          self.parser.phase = self.parser.phases["beforeHtml"]

      def anythingElse(self):
          """No doctype was seen: switch to quirks mode and move on to the
          "beforeHtml" phase."""
          parser = self.parser
          parser.compatMode = "quirks"
          parser.phase = parser.phases["beforeHtml"]

      def processCharacters(self, token):
          """Report the missing doctype and return the token for reprocessing."""
          self.parser.parseError("expected-doctype-but-got-chars")
          self.anythingElse()
          return token

      def processStartTag(self, token):
          """Report the missing doctype and return the token for reprocessing."""
          errorVars = {"name": token["name"]}
          self.parser.parseError("expected-doctype-but-got-start-tag", errorVars)
          self.anythingElse()
          return token

      def processEndTag(self, token):
          """Report the missing doctype and return the token for reprocessing."""
          errorVars = {"name": token["name"]}
          self.parser.parseError("expected-doctype-but-got-end-tag", errorVars)
          self.anythingElse()
          return token

      def processEOF(self):
          """Report the missing doctype and ask for EOF to be reprocessed."""
          self.parser.parseError("expected-doctype-but-got-eof")
          self.anythingElse()
          return True
  class BeforeHtmlPhase(Phase):
      """Phase that creates the root html element before anything else is
      inserted."""

      # helper methods
      def insertHtmlElement(self):
          """Insert an implied html root and switch to the "beforeHead" phase."""
          rootToken = impliedTagToken("html", "StartTag")
          self.tree.insertRoot(rootToken)
          self.parser.phase = self.parser.phases["beforeHead"]

      # other
      def processEOF(self):
          """Insert the root, then ask for EOF to be reprocessed."""
          self.insertHtmlElement()
          return True

      def processComment(self, token):
          # Comments seen here are attached to the document itself.
          self.tree.insertComment(token, self.tree.document)

      def processSpaceCharacters(self, token):
          # Whitespace in this phase is dropped.
          pass

      def processCharacters(self, token):
          """Insert the root and return the token for reprocessing."""
          self.insertHtmlElement()
          return token

      def processStartTag(self, token):
          """Insert the root and return the token for reprocessing; an explicit
          html start tag is remembered in ``parser.firstStartTag``."""
          if token["name"] == "html":
              self.parser.firstStartTag = True
          self.insertHtmlElement()
          return token

      def processEndTag(self, token):
          """Only head, body, html and br end tags imply the root and are
          reprocessed; any other end tag is reported and dropped."""
          if token["name"] in ("head", "body", "html", "br"):
              self.insertHtmlElement()
              return token
          self.parser.parseError("unexpected-end-tag-before-html",
                                 {"name": token["name"]})
  class BeforeHeadPhase(Phase):
      """Phase that waits for, or implies, the head element."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # Start tags: html and head get dedicated handlers, the rest fall
          # through to startTagOther.
          startTags = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("head", self.startTagHead)
          ])
          startTags.default = self.startTagOther
          self.startTagHandler = startTags

          # End tags: these four imply a head, the rest go to endTagOther.
          endTags = _utils.MethodDispatcher([
              (("head", "body", "html", "br"), self.endTagImplyHead)
          ])
          endTags.default = self.endTagOther
          self.endTagHandler = endTags

      def processEOF(self):
          """Imply a head element, then ask for EOF to be reprocessed."""
          impliedHead = impliedTagToken("head", "StartTag")
          self.startTagHead(impliedHead)
          return True

      def processSpaceCharacters(self, token):
          # Whitespace in this phase is dropped.
          pass

      def processCharacters(self, token):
          """Imply a head element and return the token for reprocessing."""
          impliedHead = impliedTagToken("head", "StartTag")
          self.startTagHead(impliedHead)
          return token

      def startTagHtml(self, token):
          """Delegate html start tags to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagHead(self, token):
          """Insert the head element, remember it as the tree's head pointer
          and switch to the "inHead" phase."""
          self.tree.insertElement(token)
          self.tree.headPointer = self.tree.openElements[-1]
          self.parser.phase = self.parser.phases["inHead"]

      def startTagOther(self, token):
          """Imply a head element and return the token for reprocessing."""
          impliedHead = impliedTagToken("head", "StartTag")
          self.startTagHead(impliedHead)
          return token

      def endTagImplyHead(self, token):
          """Imply a head element and return the token for reprocessing."""
          impliedHead = impliedTagToken("head", "StartTag")
          self.startTagHead(impliedHead)
          return token

      def endTagOther(self, token):
          """Report the unexpected end tag; the token is dropped."""
          errorVars = {"name": token["name"]}
          self.parser.parseError("end-tag-after-implied-root", errorVars)
  class InHeadPhase(Phase):
      """Phase that handles tokens while the head element is open.

      Tokens without a dedicated handler go through anythingElse(), which
      closes the head via an implied end tag, and are then handed back to
      the caller.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTagTable = [
              ("html", self.startTagHtml),
              ("title", self.startTagTitle),
              (("noframes", "style"), self.startTagNoFramesStyle),
              ("noscript", self.startTagNoscript),
              ("script", self.startTagScript),
              (("base", "basefont", "bgsound", "command", "link"),
               self.startTagBaseLinkCommand),
              ("meta", self.startTagMeta),
              ("head", self.startTagHead),
          ]
          self.startTagHandler = _utils.MethodDispatcher(startTagTable)
          self.startTagHandler.default = self.startTagOther

          endTagTable = [
              ("head", self.endTagHead),
              (("br", "html", "body"), self.endTagHtmlBodyBr),
          ]
          self.endTagHandler = _utils.MethodDispatcher(endTagTable)
          self.endTagHandler.default = self.endTagOther

      # the real thing
      def processEOF(self):
          """Close the head and return True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Close the head, then hand the token back."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an html start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagHead(self, token):
          """A second head start tag is only reported as a parse error."""
          self.parser.parseError("two-heads-are-not-better-than-one")

      def startTagBaseLinkCommand(self, token):
          """Insert the element, pop it at once and acknowledge self-closing."""
          self.tree.insertElement(token)
          self.tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

      def startTagMeta(self, token):
          """Insert a meta element and possibly switch the stream encoding.

          The encoding is only changed while the stream's encoding
          confidence is "tentative".  A charset attribute wins; otherwise a
          content attribute is parsed when http-equiv is "content-type".
          """
          self.tree.insertElement(token)
          self.tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

          stream = self.parser.tokenizer.stream
          if stream.charEncoding[1] != "tentative":
              return

          attributes = token["data"]
          if "charset" in attributes:
              stream.changeEncoding(attributes["charset"])
              return

          isContentType = ("content" in attributes and
                           "http-equiv" in attributes and
                           attributes["http-equiv"].lower() == "content-type")
          if isContentType:
              # Encoding it as UTF-8 here is a hack, as really we should pass
              # the abstract Unicode string, and just use the
              # ContentAttrParser on that, but using UTF-8 allows all chars
              # to be encoded and as an ASCII-superset works.
              encoded = attributes["content"].encode("utf-8")
              contentParser = _inputstream.ContentAttrParser(
                  _inputstream.EncodingBytes(encoded))
              stream.changeEncoding(contentParser.parse())

      def startTagTitle(self, token):
          """Parse the title element's content as RCDATA."""
          self.parser.parseRCDataRawtext(token, "RCDATA")

      def startTagNoFramesStyle(self, token):
          """Parse noframes/style content as RAWTEXT."""
          # Need to decide whether to implement the scripting-disabled case
          self.parser.parseRCDataRawtext(token, "RAWTEXT")

      def startTagNoscript(self, token):
          """RAWTEXT when scripting is on, else enter "inHeadNoscript"."""
          if not self.parser.scripting:
              self.tree.insertElement(token)
              self.parser.phase = self.parser.phases["inHeadNoscript"]
              return
          self.parser.parseRCDataRawtext(token, "RAWTEXT")

      def startTagScript(self, token):
          """Insert a script element and switch to the "text" phase."""
          self.tree.insertElement(token)
          tokenizer = self.parser.tokenizer
          tokenizer.state = tokenizer.scriptDataState
          # Remember where to come back to once the text phase is done.
          self.parser.originalPhase = self.parser.phase
          self.parser.phase = self.parser.phases["text"]

      def startTagOther(self, token):
          """Close the head, then hand the token back."""
          self.anythingElse()
          return token

      def endTagHead(self, token):
          """Pop the head element and move to the "afterHead" phase."""
          popped = self.parser.tree.openElements.pop()
          assert popped.name == "head", "Expected head got %s" % popped.name
          self.parser.phase = self.parser.phases["afterHead"]

      def endTagHtmlBodyBr(self, token):
          """Close the head, then hand the end tag back."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", errorData)

      def anythingElse(self):
          """Act as if a head end tag had been seen."""
          self.endTagHead(impliedTagToken("head"))
  class InHeadNoscriptPhase(Phase):
      """Phase used inside a noscript element that was opened in the head.

      Comments, whitespace and a fixed set of start tags are delegated to
      the "inHead" phase.  Most other tokens raise a parse error, close the
      noscript element through anythingElse() and are handed back.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTagTable = [
              ("html", self.startTagHtml),
              (("basefont", "bgsound", "link", "meta", "noframes", "style"),
               self.startTagBaseLinkCommand),
              (("head", "noscript"), self.startTagHeadNoscript),
          ]
          self.startTagHandler = _utils.MethodDispatcher(startTagTable)
          self.startTagHandler.default = self.startTagOther

          endTagTable = [
              ("noscript", self.endTagNoscript),
              ("br", self.endTagBr),
          ]
          self.endTagHandler = _utils.MethodDispatcher(endTagTable)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Report EOF as a parse error, close noscript, return True."""
          self.parser.parseError("eof-in-head-noscript")
          self.anythingElse()
          return True

      def processComment(self, token):
          """Delegate comments to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processComment(token)

      def processCharacters(self, token):
          """Report the characters, close noscript, hand the token back."""
          self.parser.parseError("char-in-head-noscript")
          self.anythingElse()
          return token

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processSpaceCharacters(token)

      def startTagHtml(self, token):
          """Delegate an html start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagBaseLinkCommand(self, token):
          """Delegate the listed head-level start tags to "inHead"."""
          inHead = self.parser.phases["inHead"]
          return inHead.processStartTag(token)

      def startTagHeadNoscript(self, token):
          """head/noscript start tags are only reported as parse errors."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", errorData)

      def startTagOther(self, token):
          """Report the tag, close noscript, hand the token back."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-inhead-noscript-tag", errorData)
          self.anythingElse()
          return token

      def endTagNoscript(self, token):
          """Pop the noscript element and return to the "inHead" phase."""
          popped = self.parser.tree.openElements.pop()
          assert popped.name == "noscript", "Expected noscript got %s" % popped.name
          self.parser.phase = self.parser.phases["inHead"]

      def endTagBr(self, token):
          """Report the br end tag, close noscript, hand the token back."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-inhead-noscript-tag", errorData)
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", errorData)

      def anythingElse(self):
          """Act as if a noscript end tag had been seen."""
          # Caller must raise parse error first!
          self.endTagNoscript(impliedTagToken("noscript"))
  class AfterHeadPhase(Phase):
      """Phase between the end of the head and the start of the body.

      A body or frameset start tag opens the matching element.  Head-level
      start tags are re-processed by "inHead" with the head element pushed
      back temporarily.  Most remaining tokens insert an implied body via
      anythingElse() and are handed back to the caller.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTagTable = [
              ("html", self.startTagHtml),
              ("body", self.startTagBody),
              ("frameset", self.startTagFrameset),
              (("base", "basefont", "bgsound", "link", "meta", "noframes",
                "script", "style", "title"),
               self.startTagFromHead),
              ("head", self.startTagHead),
          ]
          self.startTagHandler = _utils.MethodDispatcher(startTagTable)
          self.startTagHandler.default = self.startTagOther

          endTagTable = [
              (("body", "html", "br"), self.endTagHtmlBodyBr),
          ]
          self.endTagHandler = _utils.MethodDispatcher(endTagTable)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Insert an implied body and return True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Insert an implied body, then hand the token back."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an html start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagBody(self, token):
          """Insert the body element, clear framesetOK, enter "inBody"."""
          self.parser.framesetOK = False
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inBody"]

      def startTagFrameset(self, token):
          """Insert the frameset element and enter "inFrameset"."""
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inFrameset"]

      def startTagFromHead(self, token):
          """Process a head-level start tag that arrived after the head.

          The head element is pushed back onto the open-elements stack,
          the token is processed by "inHead", and afterwards the last
          element named "head" on the stack is removed again.
          """
          self.parser.parseError("unexpected-start-tag-out-of-my-head",
                                 {"name": token["name"]})
          self.tree.openElements.append(self.tree.headPointer)
          self.parser.phases["inHead"].processStartTag(token)
          headNode = next((candidate
                           for candidate in reversed(self.tree.openElements)
                           if candidate.name == "head"), None)
          if headNode is not None:
              self.tree.openElements.remove(headNode)

      def startTagHead(self, token):
          """A head start tag here is only reported as a parse error."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", errorData)

      def startTagOther(self, token):
          """Insert an implied body, then hand the token back."""
          self.anythingElse()
          return token

      def endTagHtmlBodyBr(self, token):
          """Insert an implied body, then hand the end tag back."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", errorData)

      def anythingElse(self):
          """Insert an implied body, enter "inBody" and set framesetOK."""
          impliedBody = impliedTagToken("body", "StartTag")
          self.tree.insertElement(impliedBody)
          self.parser.phase = self.parser.phases["inBody"]
          self.parser.framesetOK = True
  708. class InBodyPhase(Phase):
  709. # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
  710. # the really-really-really-very crazy mode
  def __init__(self, parser, tree):
      """Install the default whitespace handler and both dispatch tables."""
      Phase.__init__(self, parser, tree)

      # Set this to the default handler
      self.processSpaceCharacters = self.processSpaceCharactersNonPre

      # NOTE(review): "dialog" is listed for endTagBlock below but not for
      # startTagCloseP here -- confirm against the spec whether that is
      # intended.
      startTagTable = [
          ("html", self.startTagHtml),
          (("base", "basefont", "bgsound", "command", "link", "meta",
            "script", "style", "title"),
           self.startTagProcessInHead),
          ("body", self.startTagBody),
          ("frameset", self.startTagFrameset),
          (("address", "article", "aside", "blockquote", "center", "details",
            "dir", "div", "dl", "fieldset", "figcaption", "figure",
            "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
            "section", "summary", "ul"),
           self.startTagCloseP),
          (headingElements, self.startTagHeading),
          (("pre", "listing"), self.startTagPreListing),
          ("form", self.startTagForm),
          (("li", "dd", "dt"), self.startTagListItem),
          ("plaintext", self.startTagPlaintext),
          ("a", self.startTagA),
          (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
            "strong", "tt", "u"),
           self.startTagFormatting),
          ("nobr", self.startTagNobr),
          ("button", self.startTagButton),
          (("applet", "marquee", "object"),
           self.startTagAppletMarqueeObject),
          ("xmp", self.startTagXmp),
          ("table", self.startTagTable),
          (("area", "br", "embed", "img", "keygen", "wbr"),
           self.startTagVoidFormatting),
          (("param", "source", "track"), self.startTagParamSource),
          ("input", self.startTagInput),
          ("hr", self.startTagHr),
          ("image", self.startTagImage),
          ("isindex", self.startTagIsIndex),
          ("textarea", self.startTagTextarea),
          ("iframe", self.startTagIFrame),
          ("noscript", self.startTagNoscript),
          (("noembed", "noframes"), self.startTagRawtext),
          ("select", self.startTagSelect),
          (("rp", "rt"), self.startTagRpRt),
          (("option", "optgroup"), self.startTagOpt),
          ("math", self.startTagMath),
          ("svg", self.startTagSvg),
          (("caption", "col", "colgroup", "frame", "head",
            "tbody", "td", "tfoot", "th", "thead",
            "tr"),
           self.startTagMisplaced),
      ]
      self.startTagHandler = _utils.MethodDispatcher(startTagTable)
      self.startTagHandler.default = self.startTagOther

      endTagTable = [
          ("body", self.endTagBody),
          ("html", self.endTagHtml),
          (("address", "article", "aside", "blockquote", "button", "center",
            "details", "dialog", "dir", "div", "dl", "fieldset",
            "figcaption", "figure", "footer", "header", "hgroup", "listing",
            "main", "menu", "nav", "ol", "pre", "section", "summary", "ul"),
           self.endTagBlock),
          ("form", self.endTagForm),
          ("p", self.endTagP),
          (("dd", "dt", "li"), self.endTagListItem),
          (headingElements, self.endTagHeading),
          (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
            "strike", "strong", "tt", "u"),
           self.endTagFormatting),
          (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
          ("br", self.endTagBr),
      ]
      self.endTagHandler = _utils.MethodDispatcher(endTagTable)
      self.endTagHandler.default = self.endTagOther
  def isMatchingFormattingElement(self, node1, node2):
      """Return whether both nodes agree on name, namespace and attributes.

      The three fields are compared in that order and the comparison
      stops at the first one that differs.
      """
      comparedFields = ("name", "namespace", "attributes")
      return all(getattr(node1, field) == getattr(node2, field)
                 for field in comparedFields)
  782. # helper
  def addFormattingElement(self, token):
      """Insert an element and append it to the active formatting list.

      Entries of the active formatting list after the last Marker are
      scanned from the end.  If three of them already match the new
      element, the earliest of those three is removed before the new
      element is appended.
      """
      self.tree.insertElement(token)
      inserted = self.tree.openElements[-1]

      duplicates = []
      for candidate in reversed(self.tree.activeFormattingElements):
          if candidate is Marker:
              break
          if self.isMatchingFormattingElement(candidate, inserted):
              duplicates.append(candidate)

      assert len(duplicates) <= 3
      if len(duplicates) == 3:
          # The scan ran backwards, so the last hit is the earliest entry.
          self.tree.activeFormattingElements.remove(duplicates[-1])
      self.tree.activeFormattingElements.append(inserted)
  796. # the real deal
  def processEOF(self):
      """Report one parse error if an unexpected element is still open.

      The open-elements stack is scanned from the most recently opened
      element; the first element whose name is not in the allowed set
      triggers a single parse error.  Nothing is returned.
      """
      closableAtEOF = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                 "tfoot", "th", "thead", "tr", "body",
                                 "html"))
      stillOpen = reversed(self.tree.openElements)
      if any(node.name not in closableAtEOF for node in stillOpen):
          self.parser.parseError("expected-closing-tag-but-got-eof")
      # Stop parsing
  def processSpaceCharactersDropNewline(self, token):
      """Insert whitespace, dropping one leading newline where required.

      The whitespace handler is reset to the non-pre variant first.  A
      leading newline is removed only when the current node is a pre,
      listing or textarea element that has no content yet.
      """
      # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
      # want to drop leading newlines
      text = token["data"]
      self.processSpaceCharacters = self.processSpaceCharactersNonPre
      if text.startswith("\n"):
          current = self.tree.openElements[-1]
          if (current.name in ("pre", "listing", "textarea") and
                  not self.tree.openElements[-1].hasContent()):
              text = text[1:]
      if text:
          self.tree.reconstructActiveFormattingElements()
          self.tree.insertText(text)
  def processCharacters(self, token):
      """Insert character data and clear framesetOK on non-space text.

      A token whose data is exactly U+0000 is dropped.  Otherwise the
      active formatting elements are reconstructed and the text is
      inserted.  While framesetOK is still set, the first character that
      is not in spaceCharacters clears it.
      """
      data = token["data"]
      if data == "\u0000":
          # The tokenizer should always emit null on its own
          return
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertText(data)
      # A generator expression lets any() stop at the first non-space
      # character instead of building a list covering the whole token.
      if (self.parser.framesetOK and
              any(char not in spaceCharacters for char in data)):
          self.parser.framesetOK = False
  def processSpaceCharactersNonPre(self, token):
      """Reconstruct active formatting elements, then insert the text."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertText(token["data"])
  def startTagProcessInHead(self, token):
      """Delegate the start tag to the "inHead" phase and return its result."""
      inHead = self.parser.phases["inHead"]
      return inHead.processStartTag(token)
  def startTagBody(self, token):
      """Merge the attributes of a repeated body start tag.

      Always a parse error.  When the second open element is a body,
      framesetOK is cleared and attributes that the body does not yet
      have are copied from the token; existing ones are kept.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "body"})
      openElements = self.tree.openElements
      if len(openElements) == 1 or openElements[1].name != "body":
          assert self.parser.innerHTML
          return
      self.parser.framesetOK = False
      for attrName, attrValue in token["data"].items():
          if attrName not in self.tree.openElements[1].attributes:
              self.tree.openElements[1].attributes[attrName] = attrValue
  def startTagFrameset(self, token):
      """Replace the body with a frameset while framesetOK still holds.

      Always a parse error.  The token is otherwise ignored when the
      second open element is not a body or when framesetOK is false.
      Otherwise the body is detached from its parent, the stack is popped
      back to the html element, the frameset is inserted and the parser
      moves to the "inFrameset" phase.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
      openElements = self.tree.openElements
      if len(openElements) == 1 or openElements[1].name != "body":
          assert self.parser.innerHTML
          return
      if not self.parser.framesetOK:
          return
      if self.tree.openElements[1].parent:
          bodyElement = self.tree.openElements[1]
          bodyElement.parent.removeChild(bodyElement)
      while self.tree.openElements[-1].name != "html":
          self.tree.openElements.pop()
      self.tree.insertElement(token)
      self.parser.phase = self.parser.phases["inFrameset"]
  def startTagCloseP(self, token):
      """Close a p element that is in button scope, then insert the element."""
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
  def startTagPreListing(self, token):
      """Insert a pre/listing element and arm leading-newline dropping.

      A p element in button scope is closed first.  framesetOK is cleared
      and the whitespace handler is switched to the drop-newline variant.
      """
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.parser.framesetOK = False
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  def startTagForm(self, token):
      """Insert a form element unless a form pointer is already set.

      With a form pointer in place the tag is only reported as a parse
      error.  Otherwise a p element in button scope is closed, the form
      is inserted and the form pointer is set to the new element.
      """
      if self.tree.formPointer:
          self.parser.parseError("unexpected-start-tag", {"name": "form"})
          return
      if self.tree.elementInScope("p", variant="button"):
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.tree.formPointer = self.tree.openElements[-1]
  def startTagListItem(self, token):
      """Insert an li/dd/dt element, closing an open sibling item first.

      The open-elements stack is walked from the current node.  The first
      element whose name is a stop name for this tag gets an implied end
      tag.  The walk also stops at any element in specialElements other
      than address, div and p.  Afterwards a p element in button scope is
      closed and the new element is inserted.
      """
      self.parser.framesetOK = False

      stopNames = {"li": ["li"],
                   "dt": ["dt", "dd"],
                   "dd": ["dt", "dd"]}[token["name"]]
      # Iterate over a snapshot; the loop is left right after the stack
      # is modified by the implied end tag.
      for openNode in self.tree.openElements[::-1]:
          if openNode.name in stopNames:
              impliedEnd = impliedTagToken(openNode.name, "EndTag")
              self.parser.phase.processEndTag(impliedEnd)
              break
          isBarrier = (openNode.nameTuple in specialElements and
                       openNode.name not in ("address", "div", "p"))
          if isBarrier:
              break

      if self.tree.elementInScope("p", variant="button"):
          impliedEnd = impliedTagToken("p", "EndTag")
          self.parser.phase.processEndTag(impliedEnd)

      self.tree.insertElement(token)
  def startTagPlaintext(self, token):
      """Insert a plaintext element and put the tokenizer in plaintext state."""
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      tokenizer = self.parser.tokenizer
      tokenizer.state = tokenizer.plaintextState
  def startTagHeading(self, token):
      """Insert a heading element.

      A p element in button scope is closed first.  If the current node
      is itself a heading, that is a parse error and the node is popped
      before the new heading is inserted.
      """
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      currentNode = self.tree.openElements[-1]
      if currentNode.name in headingElements:
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", errorData)
          self.tree.openElements.pop()
      self.tree.insertElement(token)
  def startTagA(self, token):
      """Insert an a element, first closing one that is still active.

      If the active formatting list holds an a element, that is a parse
      error: an implied end tag is processed and the old element is
      removed from both the open-elements stack and the active formatting
      list if it is still present in them.
      """
      previousA = self.tree.elementInActiveFormattingElements("a")
      if previousA:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "a", "endName": "a"})
          self.endTagFormatting(impliedTagToken("a"))
          # The implied end tag may or may not have removed it already.
          if previousA in self.tree.openElements:
              self.tree.openElements.remove(previousA)
          if previousA in self.tree.activeFormattingElements:
              self.tree.activeFormattingElements.remove(previousA)
      self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagFormatting(self, token):
      """Reconstruct active formatting elements, then add the new one."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagNobr(self, token):
      """Add a nobr formatting element, closing one already in scope.

      A nobr element in scope is a parse error; an implied end tag is
      processed and the active formatting elements are reconstructed a
      second time before the new element is added.
      """
      self.tree.reconstructActiveFormattingElements()
      nobrInScope = self.tree.elementInScope("nobr")
      if nobrInScope:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "nobr", "endName": "nobr"})
          self.processEndTag(impliedTagToken("nobr"))
          # XXX Need tests that trigger the following
          self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagButton(self, token):
      """Insert a button element, or close an open one and retry.

      With a button already in scope this is a parse error: an implied
      end tag is processed and the token is returned to the caller.
      Otherwise the element is inserted, framesetOK is cleared and None
      is returned.
      """
      if not self.tree.elementInScope("button"):
          self.tree.reconstructActiveFormattingElements()
          self.tree.insertElement(token)
          self.parser.framesetOK = False
          return None
      self.parser.parseError("unexpected-start-tag-implies-end-tag",
                             {"startName": "button", "endName": "button"})
      self.processEndTag(impliedTagToken("button"))
      return token
  def startTagAppletMarqueeObject(self, token):
      """Insert the element and push a Marker onto the formatting list."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      tree.activeFormattingElements.append(Marker)
      self.parser.framesetOK = False
  def startTagXmp(self, token):
      """Parse an xmp element's content as RAWTEXT.

      A p element in button scope is closed first, the active formatting
      elements are reconstructed and framesetOK is cleared.
      """
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.framesetOK = False
      parser.parseRCDataRawtext(token, "RAWTEXT")
  def startTagTable(self, token):
      """Insert a table element and switch to the "inTable" phase.

      Outside quirks mode a p element in button scope is closed first.
      """
      closesP = (self.parser.compatMode != "quirks" and
                 self.tree.elementInScope("p", variant="button"))
      if closesP:
          self.processEndTag(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.parser.framesetOK = False
      self.parser.phase = self.parser.phases["inTable"]
  def startTagVoidFormatting(self, token):
      """Insert an element and pop it straight off the open stack.

      The active formatting elements are reconstructed first; the token's
      self-closing flag is acknowledged and framesetOK is cleared.
      """
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagInput(self, token):
      """Insert an input element; a hidden input leaves framesetOK alone.

      The element is handled by startTagVoidFormatting.  When the type
      attribute equals "hidden" after ASCII lower-casing, framesetOK is
      restored to the value it had before the call.
      """
      previousFramesetOK = self.parser.framesetOK
      self.startTagVoidFormatting(token)
      attributes = token["data"]
      if ("type" in attributes and
              attributes["type"].translate(asciiUpper2Lower) == "hidden"):
          # input type=hidden doesn't change framesetOK
          self.parser.framesetOK = previousFramesetOK
  def startTagParamSource(self, token):
      """Insert the element, pop it at once and acknowledge self-closing."""
      tree = self.tree
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagHr(self, token):
      """Insert an hr element and pop it straight off the open stack.

      A p element in button scope is closed first; the self-closing flag
      is acknowledged and framesetOK is cleared.
      """
      pInButtonScope = self.tree.elementInScope("p", variant="button")
      if pInButtonScope:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagImage(self, token):
      """Report an image start tag and re-process it as img.

      The replacement token carries the original attributes and
      self-closing flag.
      """
      # No really...
      self.parser.parseError("unexpected-start-tag-treated-as",
                             {"originalName": "image", "newName": "img"})
      imgToken = impliedTagToken("img", "StartTag",
                                 attributes=token["data"],
                                 selfClosing=token["selfClosing"])
      self.processStartTag(imgToken)
  def startTagIsIndex(self, token):
      """Expand an isindex start tag into form/hr/label/input markup.

      Always a parse error.  Nothing else happens when a form pointer is
      set.  Otherwise the method processes, in order: a form start tag
      (with the action attribute, if given), an hr, a label start tag,
      the prompt text, an input named "isindex" with the remaining
      attributes, the label end tag, another hr and the form end tag.
      """
      self.parser.parseError("deprecated-tag", {"name": "isindex"})
      if self.tree.formPointer:
          return

      sourceAttrs = token["data"]
      form_attrs = {}
      if "action" in sourceAttrs:
          form_attrs["action"] = sourceAttrs["action"]
      self.processStartTag(impliedTagToken("form", "StartTag",
                                           attributes=form_attrs))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processStartTag(impliedTagToken("label", "StartTag"))

      # XXX Localization ...
      prompt = "This is a searchable index. Enter search keywords: "
      if "prompt" in token["data"]:
          prompt = token["data"]["prompt"]
      self.processCharacters(
          {"type": tokenTypes["Characters"], "data": prompt})

      # The input gets every attribute except action and prompt, plus a
      # fixed name.
      inputAttrs = token["data"].copy()
      for consumed in ("action", "prompt"):
          if consumed in inputAttrs:
              del inputAttrs[consumed]
      inputAttrs["name"] = "isindex"
      self.processStartTag(impliedTagToken("input", "StartTag",
                                           attributes=inputAttrs,
                                           selfClosing=token["selfClosing"]))
      self.processEndTag(impliedTagToken("label"))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processEndTag(impliedTagToken("form"))
  def startTagTextarea(self, token):
      """Insert a textarea element and put the tokenizer in RCDATA state.

      The whitespace handler is switched to the drop-newline variant and
      framesetOK is cleared.
      """
      self.tree.insertElement(token)
      tokenizer = self.parser.tokenizer
      tokenizer.state = tokenizer.rcdataState
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
      self.parser.framesetOK = False
  def startTagIFrame(self, token):
      """Clear framesetOK, then treat the iframe as a RAWTEXT element."""
      parser = self.parser
      parser.framesetOK = False
      self.startTagRawtext(token)
  def startTagNoscript(self, token):
      """RAWTEXT handling when scripting is on, ordinary element otherwise."""
      handler = (self.startTagRawtext if self.parser.scripting
                 else self.startTagOther)
      handler(token)
  def startTagRawtext(self, token):
      """Parse the element's content as RAWTEXT.

      Used for iframe, noembed, noframes and, when scripting is enabled,
      noscript.
      """
      parser = self.parser
      parser.parseRCDataRawtext(token, "RAWTEXT")
  def startTagOpt(self, token):
      """Insert an option/optgroup element, closing an open option first."""
      currentNode = self.tree.openElements[-1]
      if currentNode.name == "option":
          self.parser.phase.processEndTag(impliedTagToken("option"))
      self.tree.reconstructActiveFormattingElements()
      self.parser.tree.insertElement(token)
  def startTagSelect(self, token):
      """Insert a select element and pick the matching select phase.

      framesetOK is cleared.  If the parser's current phase is one of the
      table-related phases the next phase is "inSelectInTable", otherwise
      it is "inSelect".
      """
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertElement(token)
      self.parser.framesetOK = False

      phases = self.parser.phases
      tablePhases = (phases["inTable"],
                     phases["inCaption"],
                     phases["inColumnGroup"],
                     phases["inTableBody"],
                     phases["inRow"],
                     phases["inCell"])
      insideTable = self.parser.phase in tablePhases
      nextPhaseName = "inSelectInTable" if insideTable else "inSelect"
      self.parser.phase = self.parser.phases[nextPhaseName]
  def startTagRpRt(self, token):
      """Insert an rp/rt element.

      With a ruby element in scope, implied end tags are generated first;
      if the current node is then not a ruby element a parse error
      (without an error code) is reported.
      """
      rubyInScope = self.tree.elementInScope("ruby")
      if rubyInScope:
          self.tree.generateImpliedEndTags()
          currentNode = self.tree.openElements[-1]
          if currentNode.name != "ruby":
              self.parser.parseError()
      self.tree.insertElement(token)
  def startTagMath(self, token):
      """Insert a math element in the MathML namespace.

      Attributes are adjusted by the parser's MathML and foreign-attribute
      helpers first.  A self-closing token is popped again right away and
      its self-closing flag acknowledged.
      """
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustMathMLAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["mathml"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagSvg(self, token):
      """Insert an svg element in the SVG namespace.

      Attributes are adjusted by the parser's SVG and foreign-attribute
      helpers first.  A self-closing token is popped again right away and
      its self-closing flag acknowledged.
      """
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustSVGAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["svg"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagMisplaced(self, token):
      """Ignore a start tag that belongs to a different insertion mode.

      Covers elements that should be children of other elements, such as
      "caption", "col", "colgroup", "frame", "head", "tbody", "td",
      "tfoot", "th", "thead" and "tr"; the tag is only reported as a
      parse error.
      """
      errorData = {"name": token["name"]}
      self.parser.parseError("unexpected-start-tag-ignored", errorData)
  def startTagOther(self, token):
      """Reconstruct active formatting elements, then insert the element."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
  def endTagP(self, token):
      """Close the p element, creating an empty one if none is in scope.

      With a p in button scope, implied end tags (other than for p) are
      generated, a parse error is reported if the current node is not a
      p, and the stack is popped up to and including the p.  Without one,
      an implied p start tag is processed, a parse error is reported and
      the method calls itself with an implied p end tag.
      """
      if self.tree.elementInScope("p", variant="button"):
          self.tree.generateImpliedEndTags("p")
          if self.tree.openElements[-1].name != "p":
              self.parser.parseError("unexpected-end-tag", {"name": "p"})
          # Pop up to and including the p element.
          while self.tree.openElements.pop().name != "p":
              pass
          return
      self.startTagCloseP(impliedTagToken("p", "StartTag"))
      self.parser.parseError("unexpected-end-tag", {"name": "p"})
      self.endTagP(impliedTagToken("p", "EndTag"))
  def endTagBody(self, token):
      """Handle a body end tag and switch to the "afterBody" phase.

      Without a body element in scope the tag is a parse error and is
      ignored.  Otherwise, when the current node is not the body, the
      open elements from the third entry onwards are scanned and the
      first one whose name is not in the allowed set is reported as a
      parse error.  The phase then becomes "afterBody".
      """
      if not self.tree.elementInScope("body"):
          self.parser.parseError()
          return
      elif self.tree.openElements[-1].name != "body":
          # Built once per call rather than once per scanned element.
          allowedOpen = frozenset(("dd", "dt", "li", "optgroup",
                                   "option", "p", "rp", "rt",
                                   "tbody", "td", "tfoot",
                                   "th", "thead", "tr", "body",
                                   "html"))
          for node in self.tree.openElements[2:]:
              if node.name not in allowedOpen:
                  # Not sure this is the correct name for the parse error
                  self.parser.parseError(
                      "expected-one-end-tag-but-got-another",
                      {"gotName": "body", "expectedName": node.name})
                  break
      self.parser.phase = self.parser.phases["afterBody"]
  def endTagHtml(self, token):
      """Treat an html end tag as a body end tag followed by the token.

      With a body element in scope an implied body end tag is processed
      and the token is returned to the caller; otherwise nothing is done
      and None is returned.
      """
      # We repeat the test for the body end tag token being ignored here
      if not self.tree.elementInScope("body"):
          return None
      self.endTagBody(impliedTagToken("body"))
      return token
  def endTagBlock(self, token):
      """Close a block-level element.

      For pre the whitespace handler is reset to the non-pre variant.  If
      the element is in scope, implied end tags are generated and the
      stack is popped up to and including it.  A parse error is reported
      whenever the current node's name differs from the tag name.
      """
      tagName = token["name"]
      # Put us back in the right whitespace handling mode
      if tagName == "pre":
          self.processSpaceCharacters = self.processSpaceCharactersNonPre
      inScope = self.tree.elementInScope(tagName)
      if inScope:
          self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1].name != tagName:
          self.parser.parseError("end-tag-too-early", {"name": tagName})
      if inScope:
          # Pop up to and including the matching element.
          while self.tree.openElements.pop().name != tagName:
              pass
  def endTagForm(self, token):
      """Close the form referenced by the form pointer.

      The form pointer is cleared in every case.  Without a pointed-to
      form in scope the tag is a parse error.  Otherwise implied end tags
      are generated, a parse error is reported if the form is not the
      current node, and the form is removed from the open-elements stack.
      """
      formNode = self.tree.formPointer
      self.tree.formPointer = None
      if formNode is None or not self.tree.elementInScope(formNode):
          self.parser.parseError("unexpected-end-tag",
                                 {"name": "form"})
          return
      self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1] != formNode:
          self.parser.parseError("end-tag-too-early-ignored",
                                 {"name": "form"})
      self.tree.openElements.remove(formNode)
  def endTagListItem(self, token):
      """Close a dd, dt or li element.

      li is looked up with the "list" scope variant, dd and dt with the
      default one.  Out of scope the tag is a parse error.  Otherwise
      implied end tags are generated (excluding the tag itself), a parse
      error is reported if the current node does not match, and the stack
      is popped up to and including the element.
      """
      tagName = token["name"]
      variant = "list" if tagName == "li" else None
      if not self.tree.elementInScope(tagName, variant=variant):
          self.parser.parseError("unexpected-end-tag", {"name": tagName})
          return
      self.tree.generateImpliedEndTags(exclude=tagName)
      if self.tree.openElements[-1].name != tagName:
          self.parser.parseError(
              "end-tag-too-early",
              {"name": tagName})
      # Pop up to and including the matching element.
      while self.tree.openElements.pop().name != tagName:
          pass
  def endTagHeading(self, token):
      """Close the nearest open heading element.

      If any heading element is in scope, implied end tags are generated.
      A parse error is reported when the current node's name differs from
      the tag name.  If a heading is (still) in scope, the stack is popped
      up to and including the first heading element encountered, which
      need not have the same name as the token.
      """
      if any(self.tree.elementInScope(heading)
             for heading in headingElements):
          self.tree.generateImpliedEndTags()

      if self.tree.openElements[-1].name != token["name"]:
          errorData = {"name": token["name"]}
          self.parser.parseError("end-tag-too-early", errorData)

      if any(self.tree.elementInScope(heading)
             for heading in headingElements):
          popped = self.tree.openElements.pop()
          while popped.name not in headingElements:
              popped = self.tree.openElements.pop()
  1179. def endTagFormatting(self, token):
  1180. """The much-feared adoption agency algorithm"""
  1181. # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
  1182. # XXX Better parseError messages appreciated.
  1183. # Step 1
  1184. outerLoopCounter = 0
  1185. # Step 2
  1186. while outerLoopCounter < 8:
  1187. # Step 3
  1188. outerLoopCounter += 1
  1189. # Step 4:
  1190. # Let the formatting element be the last element in
  1191. # the list of active formatting elements that:
  1192. # - is between the end of the list and the last scope
  1193. # marker in the list, if any, or the start of the list
  1194. # otherwise, and
  1195. # - has the same tag name as the token.
  1196. formattingElement = self.tree.elementInActiveFormattingElements(
  1197. token["name"])
  1198. if (not formattingElement or
  1199. (formattingElement in self.tree.openElements and
  1200. not self.tree.elementInScope(formattingElement.name))):
  1201. # If there is no such node, then abort these steps
  1202. # and instead act as described in the "any other
  1203. # end tag" entry below.
  1204. self.endTagOther(token)
  1205. return
  1206. # Otherwise, if there is such a node, but that node is
  1207. # not in the stack of open elements, then this is a
  1208. # parse error; remove the element from the list, and
  1209. # abort these steps.
  1210. elif formattingElement not in self.tree.openElements:
  1211. self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
  1212. self.tree.activeFormattingElements.remove(formattingElement)
  1213. return
  1214. # Otherwise, if there is such a node, and that node is
  1215. # also in the stack of open elements, but the element
  1216. # is not in scope, then this is a parse error; ignore
  1217. # the token, and abort these steps.
  1218. elif not self.tree.elementInScope(formattingElement.name):
  1219. self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
  1220. return
  1221. # Otherwise, there is a formatting element and that
  1222. # element is in the stack and is in scope. If the
  1223. # element is not the current node, this is a parse
  1224. # error. In any case, proceed with the algorithm as
  1225. # written in the following steps.
  1226. else:
  1227. if formattingElement != self.tree.openElements[-1]:
  1228. self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
  1229. # Step 5:
  1230. # Let the furthest block be the topmost node in the
  1231. # stack of open elements that is lower in the stack
  1232. # than the formatting element, and is an element in
  1233. # the special category. There might not be one.
  1234. afeIndex = self.tree.openElements.index(formattingElement)
  1235. furthestBlock = None
  1236. for element in self.tree.openElements[afeIndex:]:
  1237. if element.nameTuple in specialElements:
  1238. furthestBlock = element
  1239. break
  1240. # Step 6:
  1241. # If there is no furthest block, then the UA must
  1242. # first pop all the nodes from the bottom of the stack
  1243. # of open elements, from the current node up to and
  1244. # including the formatting element, then remove the
  1245. # formatting element from the list of active
  1246. # formatting elements, and finally abort these steps.
  1247. if furthestBlock is None:
  1248. element = self.tree.openElements.pop()
  1249. while element != formattingElement:
  1250. element = self.tree.openElements.pop()
  1251. self.tree.activeFormattingElements.remove(element)
  1252. return
  1253. # Step 7
  1254. commonAncestor = self.tree.openElements[afeIndex - 1]
  1255. # Step 8:
  1256. # The bookmark is supposed to help us identify where to reinsert
  1257. # nodes in step 15. We have to ensure that we reinsert nodes after
  1258. # the node before the active formatting element. Note the bookmark
  1259. # can move in step 9.7
  1260. bookmark = self.tree.activeFormattingElements.index(formattingElement)
  1261. # Step 9
  1262. lastNode = node = furthestBlock
  1263. innerLoopCounter = 0
  1264. index = self.tree.openElements.index(node)
  1265. while innerLoopCounter < 3:
  1266. innerLoopCounter += 1
  1267. # Node is element before node in open elements
  1268. index -= 1
  1269. node = self.tree.openElements[index]
  1270. if node not in self.tree.activeFormattingElements:
  1271. self.tree.openElements.remove(node)
  1272. continue
  1273. # Step 9.6
  1274. if node == formattingElement:
  1275. break
  1276. # Step 9.7
  1277. if lastNode == furthestBlock:
  1278. bookmark = self.tree.activeFormattingElements.index(node) + 1
  1279. # Step 9.8
  1280. clone = node.cloneNode()
  1281. # Replace node with clone
  1282. self.tree.activeFormattingElements[
  1283. self.tree.activeFormattingElements.index(node)] = clone
  1284. self.tree.openElements[
  1285. self.tree.openElements.index(node)] = clone
  1286. node = clone
  1287. # Step 9.9
  1288. # Remove lastNode from its parents, if any
  1289. if lastNode.parent:
  1290. lastNode.parent.removeChild(lastNode)
  1291. node.appendChild(lastNode)
  1292. # Step 9.10
  1293. lastNode = node
  1294. # Step 10
  1295. # Foster parent lastNode if commonAncestor is a
  1296. # table, tbody, tfoot, thead, or tr we need to foster
  1297. # parent the lastNode
  1298. if lastNode.parent:
  1299. lastNode.parent.removeChild(lastNode)
  1300. if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
  1301. parent, insertBefore = self.tree.getTableMisnestedNodePosition()
  1302. parent.insertBefore(lastNode, insertBefore)
  1303. else:
  1304. commonAncestor.appendChild(lastNode)
  1305. # Step 11
  1306. clone = formattingElement.cloneNode()
  1307. # Step 12
  1308. furthestBlock.reparentChildren(clone)
  1309. # Step 13
  1310. furthestBlock.appendChild(clone)
  1311. # Step 14
  1312. self.tree.activeFormattingElements.remove(formattingElement)
  1313. self.tree.activeFormattingElements.insert(bookmark, clone)
  1314. # Step 15
  1315. self.tree.openElements.remove(formattingElement)
  1316. self.tree.openElements.insert(
  1317. self.tree.openElements.index(furthestBlock) + 1, clone)
  1318. def endTagAppletMarqueeObject(self, token):
  1319. if self.tree.elementInScope(token["name"]):
  1320. self.tree.generateImpliedEndTags()
  1321. if self.tree.openElements[-1].name != token["name"]:
  1322. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1323. if self.tree.elementInScope(token["name"]):
  1324. element = self.tree.openElements.pop()
  1325. while element.name != token["name"]:
  1326. element = self.tree.openElements.pop()
  1327. self.tree.clearActiveFormattingElements()
  1328. def endTagBr(self, token):
  1329. self.parser.parseError("unexpected-end-tag-treated-as",
  1330. {"originalName": "br", "newName": "br element"})
  1331. self.tree.reconstructActiveFormattingElements()
  1332. self.tree.insertElement(impliedTagToken("br", "StartTag"))
  1333. self.tree.openElements.pop()
  1334. def endTagOther(self, token):
  1335. for node in self.tree.openElements[::-1]:
  1336. if node.name == token["name"]:
  1337. self.tree.generateImpliedEndTags(exclude=token["name"])
  1338. if self.tree.openElements[-1].name != token["name"]:
  1339. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1340. while self.tree.openElements.pop() != node:
  1341. pass
  1342. break
  1343. else:
  1344. if node.nameTuple in specialElements:
  1345. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1346. break
  1347. class TextPhase(Phase):
  1348. def __init__(self, parser, tree):
  1349. Phase.__init__(self, parser, tree)
  1350. self.startTagHandler = _utils.MethodDispatcher([])
  1351. self.startTagHandler.default = self.startTagOther
  1352. self.endTagHandler = _utils.MethodDispatcher([
  1353. ("script", self.endTagScript)])
  1354. self.endTagHandler.default = self.endTagOther
  1355. def processCharacters(self, token):
  1356. self.tree.insertText(token["data"])
  1357. def processEOF(self):
  1358. self.parser.parseError("expected-named-closing-tag-but-got-eof",
  1359. {"name": self.tree.openElements[-1].name})
  1360. self.tree.openElements.pop()
  1361. self.parser.phase = self.parser.originalPhase
  1362. return True
  1363. def startTagOther(self, token):
  1364. assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
  1365. def endTagScript(self, token):
  1366. node = self.tree.openElements.pop()
  1367. assert node.name == "script"
  1368. self.parser.phase = self.parser.originalPhase
  1369. # The rest of this method is all stuff that only happens if
  1370. # document.write works
  1371. def endTagOther(self, token):
  1372. self.tree.openElements.pop()
  1373. self.parser.phase = self.parser.originalPhase
  1374. class InTablePhase(Phase):
  1375. # http://www.whatwg.org/specs/web-apps/current-work/#in-table
  1376. def __init__(self, parser, tree):
  1377. Phase.__init__(self, parser, tree)
  1378. self.startTagHandler = _utils.MethodDispatcher([
  1379. ("html", self.startTagHtml),
  1380. ("caption", self.startTagCaption),
  1381. ("colgroup", self.startTagColgroup),
  1382. ("col", self.startTagCol),
  1383. (("tbody", "tfoot", "thead"), self.startTagRowGroup),
  1384. (("td", "th", "tr"), self.startTagImplyTbody),
  1385. ("table", self.startTagTable),
  1386. (("style", "script"), self.startTagStyleScript),
  1387. ("input", self.startTagInput),
  1388. ("form", self.startTagForm)
  1389. ])
  1390. self.startTagHandler.default = self.startTagOther
  1391. self.endTagHandler = _utils.MethodDispatcher([
  1392. ("table", self.endTagTable),
  1393. (("body", "caption", "col", "colgroup", "html", "tbody", "td",
  1394. "tfoot", "th", "thead", "tr"), self.endTagIgnore)
  1395. ])
  1396. self.endTagHandler.default = self.endTagOther
  1397. # helper methods
  1398. def clearStackToTableContext(self):
  1399. # "clear the stack back to a table context"
  1400. while self.tree.openElements[-1].name not in ("table", "html"):
  1401. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1402. # {"name": self.tree.openElements[-1].name})
  1403. self.tree.openElements.pop()
  1404. # When the current node is <html> it's an innerHTML case
  1405. # processing methods
  1406. def processEOF(self):
  1407. if self.tree.openElements[-1].name != "html":
  1408. self.parser.parseError("eof-in-table")
  1409. else:
  1410. assert self.parser.innerHTML
  1411. # Stop parsing
  1412. def processSpaceCharacters(self, token):
  1413. originalPhase = self.parser.phase
  1414. self.parser.phase = self.parser.phases["inTableText"]
  1415. self.parser.phase.originalPhase = originalPhase
  1416. self.parser.phase.processSpaceCharacters(token)
  1417. def processCharacters(self, token):
  1418. originalPhase = self.parser.phase
  1419. self.parser.phase = self.parser.phases["inTableText"]
  1420. self.parser.phase.originalPhase = originalPhase
  1421. self.parser.phase.processCharacters(token)
  1422. def insertText(self, token):
  1423. # If we get here there must be at least one non-whitespace character
  1424. # Do the table magic!
  1425. self.tree.insertFromTable = True
  1426. self.parser.phases["inBody"].processCharacters(token)
  1427. self.tree.insertFromTable = False
  1428. def startTagCaption(self, token):
  1429. self.clearStackToTableContext()
  1430. self.tree.activeFormattingElements.append(Marker)
  1431. self.tree.insertElement(token)
  1432. self.parser.phase = self.parser.phases["inCaption"]
  1433. def startTagColgroup(self, token):
  1434. self.clearStackToTableContext()
  1435. self.tree.insertElement(token)
  1436. self.parser.phase = self.parser.phases["inColumnGroup"]
  1437. def startTagCol(self, token):
  1438. self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
  1439. return token
  1440. def startTagRowGroup(self, token):
  1441. self.clearStackToTableContext()
  1442. self.tree.insertElement(token)
  1443. self.parser.phase = self.parser.phases["inTableBody"]
  1444. def startTagImplyTbody(self, token):
  1445. self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
  1446. return token
  1447. def startTagTable(self, token):
  1448. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  1449. {"startName": "table", "endName": "table"})
  1450. self.parser.phase.processEndTag(impliedTagToken("table"))
  1451. if not self.parser.innerHTML:
  1452. return token
  1453. def startTagStyleScript(self, token):
  1454. return self.parser.phases["inHead"].processStartTag(token)
  1455. def startTagInput(self, token):
  1456. if ("type" in token["data"] and
  1457. token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
  1458. self.parser.parseError("unexpected-hidden-input-in-table")
  1459. self.tree.insertElement(token)
  1460. # XXX associate with form
  1461. self.tree.openElements.pop()
  1462. else:
  1463. self.startTagOther(token)
  1464. def startTagForm(self, token):
  1465. self.parser.parseError("unexpected-form-in-table")
  1466. if self.tree.formPointer is None:
  1467. self.tree.insertElement(token)
  1468. self.tree.formPointer = self.tree.openElements[-1]
  1469. self.tree.openElements.pop()
  1470. def startTagOther(self, token):
  1471. self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
  1472. # Do the table magic!
  1473. self.tree.insertFromTable = True
  1474. self.parser.phases["inBody"].processStartTag(token)
  1475. self.tree.insertFromTable = False
  1476. def endTagTable(self, token):
  1477. if self.tree.elementInScope("table", variant="table"):
  1478. self.tree.generateImpliedEndTags()
  1479. if self.tree.openElements[-1].name != "table":
  1480. self.parser.parseError("end-tag-too-early-named",
  1481. {"gotName": "table",
  1482. "expectedName": self.tree.openElements[-1].name})
  1483. while self.tree.openElements[-1].name != "table":
  1484. self.tree.openElements.pop()
  1485. self.tree.openElements.pop()
  1486. self.parser.resetInsertionMode()
  1487. else:
  1488. # innerHTML case
  1489. assert self.parser.innerHTML
  1490. self.parser.parseError()
  1491. def endTagIgnore(self, token):
  1492. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1493. def endTagOther(self, token):
  1494. self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
  1495. # Do the table magic!
  1496. self.tree.insertFromTable = True
  1497. self.parser.phases["inBody"].processEndTag(token)
  1498. self.tree.insertFromTable = False
  1499. class InTableTextPhase(Phase):
  1500. def __init__(self, parser, tree):
  1501. Phase.__init__(self, parser, tree)
  1502. self.originalPhase = None
  1503. self.characterTokens = []
  1504. def flushCharacters(self):
  1505. data = "".join([item["data"] for item in self.characterTokens])
  1506. if any([item not in spaceCharacters for item in data]):
  1507. token = {"type": tokenTypes["Characters"], "data": data}
  1508. self.parser.phases["inTable"].insertText(token)
  1509. elif data:
  1510. self.tree.insertText(data)
  1511. self.characterTokens = []
  1512. def processComment(self, token):
  1513. self.flushCharacters()
  1514. self.parser.phase = self.originalPhase
  1515. return token
  1516. def processEOF(self):
  1517. self.flushCharacters()
  1518. self.parser.phase = self.originalPhase
  1519. return True
  1520. def processCharacters(self, token):
  1521. if token["data"] == "\u0000":
  1522. return
  1523. self.characterTokens.append(token)
  1524. def processSpaceCharacters(self, token):
  1525. # pretty sure we should never reach here
  1526. self.characterTokens.append(token)
  1527. # assert False
  1528. def processStartTag(self, token):
  1529. self.flushCharacters()
  1530. self.parser.phase = self.originalPhase
  1531. return token
  1532. def processEndTag(self, token):
  1533. self.flushCharacters()
  1534. self.parser.phase = self.originalPhase
  1535. return token
  1536. class InCaptionPhase(Phase):
  1537. # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
  1538. def __init__(self, parser, tree):
  1539. Phase.__init__(self, parser, tree)
  1540. self.startTagHandler = _utils.MethodDispatcher([
  1541. ("html", self.startTagHtml),
  1542. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1543. "thead", "tr"), self.startTagTableElement)
  1544. ])
  1545. self.startTagHandler.default = self.startTagOther
  1546. self.endTagHandler = _utils.MethodDispatcher([
  1547. ("caption", self.endTagCaption),
  1548. ("table", self.endTagTable),
  1549. (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
  1550. "thead", "tr"), self.endTagIgnore)
  1551. ])
  1552. self.endTagHandler.default = self.endTagOther
  1553. def ignoreEndTagCaption(self):
  1554. return not self.tree.elementInScope("caption", variant="table")
  1555. def processEOF(self):
  1556. self.parser.phases["inBody"].processEOF()
  1557. def processCharacters(self, token):
  1558. return self.parser.phases["inBody"].processCharacters(token)
  1559. def startTagTableElement(self, token):
  1560. self.parser.parseError()
  1561. # XXX Have to duplicate logic here to find out if the tag is ignored
  1562. ignoreEndTag = self.ignoreEndTagCaption()
  1563. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1564. if not ignoreEndTag:
  1565. return token
  1566. def startTagOther(self, token):
  1567. return self.parser.phases["inBody"].processStartTag(token)
  1568. def endTagCaption(self, token):
  1569. if not self.ignoreEndTagCaption():
  1570. # AT this code is quite similar to endTagTable in "InTable"
  1571. self.tree.generateImpliedEndTags()
  1572. if self.tree.openElements[-1].name != "caption":
  1573. self.parser.parseError("expected-one-end-tag-but-got-another",
  1574. {"gotName": "caption",
  1575. "expectedName": self.tree.openElements[-1].name})
  1576. while self.tree.openElements[-1].name != "caption":
  1577. self.tree.openElements.pop()
  1578. self.tree.openElements.pop()
  1579. self.tree.clearActiveFormattingElements()
  1580. self.parser.phase = self.parser.phases["inTable"]
  1581. else:
  1582. # innerHTML case
  1583. assert self.parser.innerHTML
  1584. self.parser.parseError()
  1585. def endTagTable(self, token):
  1586. self.parser.parseError()
  1587. ignoreEndTag = self.ignoreEndTagCaption()
  1588. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1589. if not ignoreEndTag:
  1590. return token
  1591. def endTagIgnore(self, token):
  1592. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1593. def endTagOther(self, token):
  1594. return self.parser.phases["inBody"].processEndTag(token)
  1595. class InColumnGroupPhase(Phase):
  1596. # http://www.whatwg.org/specs/web-apps/current-work/#in-column
  1597. def __init__(self, parser, tree):
  1598. Phase.__init__(self, parser, tree)
  1599. self.startTagHandler = _utils.MethodDispatcher([
  1600. ("html", self.startTagHtml),
  1601. ("col", self.startTagCol)
  1602. ])
  1603. self.startTagHandler.default = self.startTagOther
  1604. self.endTagHandler = _utils.MethodDispatcher([
  1605. ("colgroup", self.endTagColgroup),
  1606. ("col", self.endTagCol)
  1607. ])
  1608. self.endTagHandler.default = self.endTagOther
  1609. def ignoreEndTagColgroup(self):
  1610. return self.tree.openElements[-1].name == "html"
  1611. def processEOF(self):
  1612. if self.tree.openElements[-1].name == "html":
  1613. assert self.parser.innerHTML
  1614. return
  1615. else:
  1616. ignoreEndTag = self.ignoreEndTagColgroup()
  1617. self.endTagColgroup(impliedTagToken("colgroup"))
  1618. if not ignoreEndTag:
  1619. return True
  1620. def processCharacters(self, token):
  1621. ignoreEndTag = self.ignoreEndTagColgroup()
  1622. self.endTagColgroup(impliedTagToken("colgroup"))
  1623. if not ignoreEndTag:
  1624. return token
  1625. def startTagCol(self, token):
  1626. self.tree.insertElement(token)
  1627. self.tree.openElements.pop()
  1628. token["selfClosingAcknowledged"] = True
  1629. def startTagOther(self, token):
  1630. ignoreEndTag = self.ignoreEndTagColgroup()
  1631. self.endTagColgroup(impliedTagToken("colgroup"))
  1632. if not ignoreEndTag:
  1633. return token
  1634. def endTagColgroup(self, token):
  1635. if self.ignoreEndTagColgroup():
  1636. # innerHTML case
  1637. assert self.parser.innerHTML
  1638. self.parser.parseError()
  1639. else:
  1640. self.tree.openElements.pop()
  1641. self.parser.phase = self.parser.phases["inTable"]
  1642. def endTagCol(self, token):
  1643. self.parser.parseError("no-end-tag", {"name": "col"})
  1644. def endTagOther(self, token):
  1645. ignoreEndTag = self.ignoreEndTagColgroup()
  1646. self.endTagColgroup(impliedTagToken("colgroup"))
  1647. if not ignoreEndTag:
  1648. return token
  1649. class InTableBodyPhase(Phase):
  1650. # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
  1651. def __init__(self, parser, tree):
  1652. Phase.__init__(self, parser, tree)
  1653. self.startTagHandler = _utils.MethodDispatcher([
  1654. ("html", self.startTagHtml),
  1655. ("tr", self.startTagTr),
  1656. (("td", "th"), self.startTagTableCell),
  1657. (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
  1658. self.startTagTableOther)
  1659. ])
  1660. self.startTagHandler.default = self.startTagOther
  1661. self.endTagHandler = _utils.MethodDispatcher([
  1662. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1663. ("table", self.endTagTable),
  1664. (("body", "caption", "col", "colgroup", "html", "td", "th",
  1665. "tr"), self.endTagIgnore)
  1666. ])
  1667. self.endTagHandler.default = self.endTagOther
  1668. # helper methods
  1669. def clearStackToTableBodyContext(self):
  1670. while self.tree.openElements[-1].name not in ("tbody", "tfoot",
  1671. "thead", "html"):
  1672. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1673. # {"name": self.tree.openElements[-1].name})
  1674. self.tree.openElements.pop()
  1675. if self.tree.openElements[-1].name == "html":
  1676. assert self.parser.innerHTML
  1677. # the rest
  1678. def processEOF(self):
  1679. self.parser.phases["inTable"].processEOF()
  1680. def processSpaceCharacters(self, token):
  1681. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1682. def processCharacters(self, token):
  1683. return self.parser.phases["inTable"].processCharacters(token)
  1684. def startTagTr(self, token):
  1685. self.clearStackToTableBodyContext()
  1686. self.tree.insertElement(token)
  1687. self.parser.phase = self.parser.phases["inRow"]
  1688. def startTagTableCell(self, token):
  1689. self.parser.parseError("unexpected-cell-in-table-body",
  1690. {"name": token["name"]})
  1691. self.startTagTr(impliedTagToken("tr", "StartTag"))
  1692. return token
  1693. def startTagTableOther(self, token):
  1694. # XXX AT Any ideas on how to share this with endTagTable?
  1695. if (self.tree.elementInScope("tbody", variant="table") or
  1696. self.tree.elementInScope("thead", variant="table") or
  1697. self.tree.elementInScope("tfoot", variant="table")):
  1698. self.clearStackToTableBodyContext()
  1699. self.endTagTableRowGroup(
  1700. impliedTagToken(self.tree.openElements[-1].name))
  1701. return token
  1702. else:
  1703. # innerHTML case
  1704. assert self.parser.innerHTML
  1705. self.parser.parseError()
  1706. def startTagOther(self, token):
  1707. return self.parser.phases["inTable"].processStartTag(token)
  1708. def endTagTableRowGroup(self, token):
  1709. if self.tree.elementInScope(token["name"], variant="table"):
  1710. self.clearStackToTableBodyContext()
  1711. self.tree.openElements.pop()
  1712. self.parser.phase = self.parser.phases["inTable"]
  1713. else:
  1714. self.parser.parseError("unexpected-end-tag-in-table-body",
  1715. {"name": token["name"]})
  1716. def endTagTable(self, token):
  1717. if (self.tree.elementInScope("tbody", variant="table") or
  1718. self.tree.elementInScope("thead", variant="table") or
  1719. self.tree.elementInScope("tfoot", variant="table")):
  1720. self.clearStackToTableBodyContext()
  1721. self.endTagTableRowGroup(
  1722. impliedTagToken(self.tree.openElements[-1].name))
  1723. return token
  1724. else:
  1725. # innerHTML case
  1726. assert self.parser.innerHTML
  1727. self.parser.parseError()
  1728. def endTagIgnore(self, token):
  1729. self.parser.parseError("unexpected-end-tag-in-table-body",
  1730. {"name": token["name"]})
  1731. def endTagOther(self, token):
  1732. return self.parser.phases["inTable"].processEndTag(token)
  1733. class InRowPhase(Phase):
  1734. # http://www.whatwg.org/specs/web-apps/current-work/#in-row
  1735. def __init__(self, parser, tree):
  1736. Phase.__init__(self, parser, tree)
  1737. self.startTagHandler = _utils.MethodDispatcher([
  1738. ("html", self.startTagHtml),
  1739. (("td", "th"), self.startTagTableCell),
  1740. (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
  1741. "tr"), self.startTagTableOther)
  1742. ])
  1743. self.startTagHandler.default = self.startTagOther
  1744. self.endTagHandler = _utils.MethodDispatcher([
  1745. ("tr", self.endTagTr),
  1746. ("table", self.endTagTable),
  1747. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1748. (("body", "caption", "col", "colgroup", "html", "td", "th"),
  1749. self.endTagIgnore)
  1750. ])
  1751. self.endTagHandler.default = self.endTagOther
  1752. # helper methods (XXX unify this with other table helper methods)
  1753. def clearStackToTableRowContext(self):
  1754. while self.tree.openElements[-1].name not in ("tr", "html"):
  1755. self.parser.parseError("unexpected-implied-end-tag-in-table-row",
  1756. {"name": self.tree.openElements[-1].name})
  1757. self.tree.openElements.pop()
  1758. def ignoreEndTagTr(self):
  1759. return not self.tree.elementInScope("tr", variant="table")
  1760. # the rest
  1761. def processEOF(self):
  1762. self.parser.phases["inTable"].processEOF()
  1763. def processSpaceCharacters(self, token):
  1764. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1765. def processCharacters(self, token):
  1766. return self.parser.phases["inTable"].processCharacters(token)
  1767. def startTagTableCell(self, token):
  1768. self.clearStackToTableRowContext()
  1769. self.tree.insertElement(token)
  1770. self.parser.phase = self.parser.phases["inCell"]
  1771. self.tree.activeFormattingElements.append(Marker)
  1772. def startTagTableOther(self, token):
  1773. ignoreEndTag = self.ignoreEndTagTr()
  1774. self.endTagTr(impliedTagToken("tr"))
  1775. # XXX how are we sure it's always ignored in the innerHTML case?
  1776. if not ignoreEndTag:
  1777. return token
  1778. def startTagOther(self, token):
  1779. return self.parser.phases["inTable"].processStartTag(token)
  1780. def endTagTr(self, token):
  1781. if not self.ignoreEndTagTr():
  1782. self.clearStackToTableRowContext()
  1783. self.tree.openElements.pop()
  1784. self.parser.phase = self.parser.phases["inTableBody"]
  1785. else:
  1786. # innerHTML case
  1787. assert self.parser.innerHTML
  1788. self.parser.parseError()
  1789. def endTagTable(self, token):
  1790. ignoreEndTag = self.ignoreEndTagTr()
  1791. self.endTagTr(impliedTagToken("tr"))
  1792. # Reprocess the current tag if the tr end tag was not ignored
  1793. # XXX how are we sure it's always ignored in the innerHTML case?
  1794. if not ignoreEndTag:
  1795. return token
  1796. def endTagTableRowGroup(self, token):
  1797. if self.tree.elementInScope(token["name"], variant="table"):
  1798. self.endTagTr(impliedTagToken("tr"))
  1799. return token
  1800. else:
  1801. self.parser.parseError()
  1802. def endTagIgnore(self, token):
  1803. self.parser.parseError("unexpected-end-tag-in-table-row",
  1804. {"name": token["name"]})
  1805. def endTagOther(self, token):
  1806. return self.parser.phases["inTable"].processEndTag(token)
  1807. class InCellPhase(Phase):
  1808. # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
  1809. def __init__(self, parser, tree):
  1810. Phase.__init__(self, parser, tree)
  1811. self.startTagHandler = _utils.MethodDispatcher([
  1812. ("html", self.startTagHtml),
  1813. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1814. "thead", "tr"), self.startTagTableOther)
  1815. ])
  1816. self.startTagHandler.default = self.startTagOther
  1817. self.endTagHandler = _utils.MethodDispatcher([
  1818. (("td", "th"), self.endTagTableCell),
  1819. (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
  1820. (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
  1821. ])
  1822. self.endTagHandler.default = self.endTagOther
  1823. # helper
  1824. def closeCell(self):
  1825. if self.tree.elementInScope("td", variant="table"):
  1826. self.endTagTableCell(impliedTagToken("td"))
  1827. elif self.tree.elementInScope("th", variant="table"):
  1828. self.endTagTableCell(impliedTagToken("th"))
  1829. # the rest
  1830. def processEOF(self):
  1831. self.parser.phases["inBody"].processEOF()
  1832. def processCharacters(self, token):
  1833. return self.parser.phases["inBody"].processCharacters(token)
  1834. def startTagTableOther(self, token):
  1835. if (self.tree.elementInScope("td", variant="table") or
  1836. self.tree.elementInScope("th", variant="table")):
  1837. self.closeCell()
  1838. return token
  1839. else:
  1840. # innerHTML case
  1841. assert self.parser.innerHTML
  1842. self.parser.parseError()
  1843. def startTagOther(self, token):
  1844. return self.parser.phases["inBody"].processStartTag(token)
  1845. def endTagTableCell(self, token):
  1846. if self.tree.elementInScope(token["name"], variant="table"):
  1847. self.tree.generateImpliedEndTags(token["name"])
  1848. if self.tree.openElements[-1].name != token["name"]:
  1849. self.parser.parseError("unexpected-cell-end-tag",
  1850. {"name": token["name"]})
  1851. while True:
  1852. node = self.tree.openElements.pop()
  1853. if node.name == token["name"]:
  1854. break
  1855. else:
  1856. self.tree.openElements.pop()
  1857. self.tree.clearActiveFormattingElements()
  1858. self.parser.phase = self.parser.phases["inRow"]
  1859. else:
  1860. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1861. def endTagIgnore(self, token):
  1862. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1863. def endTagImply(self, token):
  1864. if self.tree.elementInScope(token["name"], variant="table"):
  1865. self.closeCell()
  1866. return token
  1867. else:
  1868. # sometimes innerHTML case
  1869. self.parser.parseError()
  1870. def endTagOther(self, token):
  1871. return self.parser.phases["inBody"].processEndTag(token)
  1872. class InSelectPhase(Phase):
  1873. def __init__(self, parser, tree):
  1874. Phase.__init__(self, parser, tree)
  1875. self.startTagHandler = _utils.MethodDispatcher([
  1876. ("html", self.startTagHtml),
  1877. ("option", self.startTagOption),
  1878. ("optgroup", self.startTagOptgroup),
  1879. ("select", self.startTagSelect),
  1880. (("input", "keygen", "textarea"), self.startTagInput),
  1881. ("script", self.startTagScript)
  1882. ])
  1883. self.startTagHandler.default = self.startTagOther
  1884. self.endTagHandler = _utils.MethodDispatcher([
  1885. ("option", self.endTagOption),
  1886. ("optgroup", self.endTagOptgroup),
  1887. ("select", self.endTagSelect)
  1888. ])
  1889. self.endTagHandler.default = self.endTagOther
  1890. # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  1891. def processEOF(self):
  1892. if self.tree.openElements[-1].name != "html":
  1893. self.parser.parseError("eof-in-select")
  1894. else:
  1895. assert self.parser.innerHTML
  1896. def processCharacters(self, token):
  1897. if token["data"] == "\u0000":
  1898. return
  1899. self.tree.insertText(token["data"])
  1900. def startTagOption(self, token):
  1901. # We need to imply </option> if <option> is the current node.
  1902. if self.tree.openElements[-1].name == "option":
  1903. self.tree.openElements.pop()
  1904. self.tree.insertElement(token)
  1905. def startTagOptgroup(self, token):
  1906. if self.tree.openElements[-1].name == "option":
  1907. self.tree.openElements.pop()
  1908. if self.tree.openElements[-1].name == "optgroup":
  1909. self.tree.openElements.pop()
  1910. self.tree.insertElement(token)
  1911. def startTagSelect(self, token):
  1912. self.parser.parseError("unexpected-select-in-select")
  1913. self.endTagSelect(impliedTagToken("select"))
  1914. def startTagInput(self, token):
  1915. self.parser.parseError("unexpected-input-in-select")
  1916. if self.tree.elementInScope("select", variant="select"):
  1917. self.endTagSelect(impliedTagToken("select"))
  1918. return token
  1919. else:
  1920. assert self.parser.innerHTML
  1921. def startTagScript(self, token):
  1922. return self.parser.phases["inHead"].processStartTag(token)
  1923. def startTagOther(self, token):
  1924. self.parser.parseError("unexpected-start-tag-in-select",
  1925. {"name": token["name"]})
  1926. def endTagOption(self, token):
  1927. if self.tree.openElements[-1].name == "option":
  1928. self.tree.openElements.pop()
  1929. else:
  1930. self.parser.parseError("unexpected-end-tag-in-select",
  1931. {"name": "option"})
  1932. def endTagOptgroup(self, token):
  1933. # </optgroup> implicitly closes <option>
  1934. if (self.tree.openElements[-1].name == "option" and
  1935. self.tree.openElements[-2].name == "optgroup"):
  1936. self.tree.openElements.pop()
  1937. # It also closes </optgroup>
  1938. if self.tree.openElements[-1].name == "optgroup":
  1939. self.tree.openElements.pop()
  1940. # But nothing else
  1941. else:
  1942. self.parser.parseError("unexpected-end-tag-in-select",
  1943. {"name": "optgroup"})
  1944. def endTagSelect(self, token):
  1945. if self.tree.elementInScope("select", variant="select"):
  1946. node = self.tree.openElements.pop()
  1947. while node.name != "select":
  1948. node = self.tree.openElements.pop()
  1949. self.parser.resetInsertionMode()
  1950. else:
  1951. # innerHTML case
  1952. assert self.parser.innerHTML
  1953. self.parser.parseError()
  def endTagOther(self, token):
      """Report any other end tag as ``unexpected-end-tag-in-select``.

      The stack of open elements is not touched and nothing is returned.
      """
      details = {"name": token["name"]}
      self.parser.parseError("unexpected-end-tag-in-select", details)
  class InSelectInTablePhase(Phase):
      """Token handlers for the "inSelectInTable" phase.

      Table-structure tags get special treatment (they close the select via
      an implied ``</select>``); everything else is forwarded to the
      "inSelect" phase.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # Tag names routed to the table-specific handlers below.
          table_tags = ("caption", "table", "tbody", "tfoot", "thead",
                        "tr", "td", "th")

          start_dispatch = _utils.MethodDispatcher([
              (table_tags, self.startTagTable)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

          end_dispatch = _utils.MethodDispatcher([
              (table_tags, self.endTagTable)
          ])
          end_dispatch.default = self.endTagOther
          self.endTagHandler = end_dispatch

      def processEOF(self):
          """Forward end-of-file handling to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          in_select.processEOF()

      def processCharacters(self, token):
          """Forward character tokens to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processCharacters(token)

      def startTagTable(self, token):
          """Report the table start tag, close the select, return the token."""
          details = {"name": token["name"]}
          self.parser.parseError(
              "unexpected-table-element-start-tag-in-select-in-table", details)
          self.endTagOther(impliedTagToken("select"))
          return token

      def startTagOther(self, token):
          """Forward any other start tag to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processStartTag(token)

      def endTagTable(self, token):
          """Report the table end tag; close the select if the tag is open.

          The token is returned only when an element with the token's name
          is in table scope; otherwise the end tag is dropped.
          """
          details = {"name": token["name"]}
          self.parser.parseError(
              "unexpected-table-element-end-tag-in-select-in-table", details)
          if not self.tree.elementInScope(token["name"], variant="table"):
              return None
          self.endTagOther(impliedTagToken("select"))
          return token

      def endTagOther(self, token):
          """Forward any other end tag to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processEndTag(token)
  class InForeignContentPhase(Phase):
      """Token handlers for content inside foreign (MathML / SVG) elements."""

      # HTML start tags that are a parse error inside foreign content; see
      # processStartTag for how they are handled.
      breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                    "center", "code", "dd", "div", "dl", "dt",
                                    "em", "embed", "h1", "h2", "h3",
                                    "h4", "h5", "h6", "head", "hr", "i", "img",
                                    "li", "listing", "menu", "meta", "nobr",
                                    "ol", "p", "pre", "ruby", "s", "small",
                                    "span", "strong", "strike", "sub", "sup",
                                    "table", "tt", "u", "ul", "var"])

      # All-lowercase SVG tag names mapped to their mixed-case spelling.
      # Built once for the class rather than on every adjustSVGTagNames call.
      svgTagNameReplacements = {"altglyph": "altGlyph",
                                "altglyphdef": "altGlyphDef",
                                "altglyphitem": "altGlyphItem",
                                "animatecolor": "animateColor",
                                "animatemotion": "animateMotion",
                                "animatetransform": "animateTransform",
                                "clippath": "clipPath",
                                "feblend": "feBlend",
                                "fecolormatrix": "feColorMatrix",
                                "fecomponenttransfer": "feComponentTransfer",
                                "fecomposite": "feComposite",
                                "feconvolvematrix": "feConvolveMatrix",
                                "fediffuselighting": "feDiffuseLighting",
                                "fedisplacementmap": "feDisplacementMap",
                                "fedistantlight": "feDistantLight",
                                "feflood": "feFlood",
                                "fefunca": "feFuncA",
                                "fefuncb": "feFuncB",
                                "fefuncg": "feFuncG",
                                "fefuncr": "feFuncR",
                                "fegaussianblur": "feGaussianBlur",
                                "feimage": "feImage",
                                "femerge": "feMerge",
                                "femergenode": "feMergeNode",
                                "femorphology": "feMorphology",
                                "feoffset": "feOffset",
                                "fepointlight": "fePointLight",
                                "fespecularlighting": "feSpecularLighting",
                                "fespotlight": "feSpotLight",
                                "fetile": "feTile",
                                "feturbulence": "feTurbulence",
                                "foreignobject": "foreignObject",
                                "glyphref": "glyphRef",
                                "lineargradient": "linearGradient",
                                "radialgradient": "radialGradient",
                                "textpath": "textPath"}

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

      def adjustSVGTagNames(self, token):
          """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

          Names without an entry in ``svgTagNameReplacements`` are left
          unchanged.
          """
          replacements = self.svgTagNameReplacements
          if token["name"] in replacements:
              token["name"] = replacements[token["name"]]

      def processCharacters(self, token):
          """Process character data inside foreign content.

          A token whose data is exactly U+0000 is replaced by U+FFFD.
          Otherwise, any character outside ``spaceCharacters`` clears the
          parser's ``framesetOK`` flag.  The token is then passed to
          ``Phase.processCharacters``.
          """
          if token["data"] == "\u0000":
              token["data"] = "\uFFFD"
          elif (self.parser.framesetOK and
                any(char not in spaceCharacters for char in token["data"])):
              self.parser.framesetOK = False
          Phase.processCharacters(self, token)

      def processStartTag(self, token):
          """Process a start tag while the current node is foreign.

          Breakout tags (and ``<font>`` carrying a ``color``, ``face`` or
          ``size`` attribute) are reported as
          ``unexpected-html-element-in-foreign-content``; elements are then
          popped until the current node is in the default namespace or is an
          HTML / MathML-text integration point, and the token is returned to
          the caller.  Any other tag is inserted in the current node's
          namespace after attribute (and, for SVG, tag-name) adjustment, and
          nothing is returned.
          """
          currentNode = self.tree.openElements[-1]
          if (token["name"] in self.breakoutElements or
              (token["name"] == "font" and
               set(token["data"].keys()) & set(["color", "face", "size"]))):
              self.parser.parseError("unexpected-html-element-in-foreign-content",
                                     {"name": token["name"]})
              while (self.tree.openElements[-1].namespace !=
                     self.tree.defaultNamespace and
                     not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                     not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                  self.tree.openElements.pop()
              return token

          else:
              if currentNode.namespace == namespaces["mathml"]:
                  self.parser.adjustMathMLAttributes(token)
              elif currentNode.namespace == namespaces["svg"]:
                  self.adjustSVGTagNames(token)
                  self.parser.adjustSVGAttributes(token)
              self.parser.adjustForeignAttributes(token)
              token["namespace"] = currentNode.namespace
              self.tree.insertElement(token)
              if token["selfClosing"]:
                  # Self-closing foreign elements are closed immediately.
                  self.tree.openElements.pop()
                  token["selfClosingAcknowledged"] = True

      def processEndTag(self, token):
          """Process an end tag while the current node is foreign.

          The stack of open elements is walked from the top.  If a node whose
          lower-cased name equals the token name is found, elements are
          popped until that node has been popped and ``None`` is returned.
          If the walk first reaches a node in the default namespace, the
          token is passed to the parser's current phase and that result is
          returned.  A mismatch with the current node is reported as
          ``unexpected-end-tag`` up front.
          """
          nodeIndex = len(self.tree.openElements) - 1
          node = self.tree.openElements[-1]
          if node.name.translate(asciiUpper2Lower) != token["name"]:
              self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

          while True:
              if node.name.translate(asciiUpper2Lower) == token["name"]:
                  # XXX this isn't in the spec but it seems necessary
                  if self.parser.phase == self.parser.phases["inTableText"]:
                      self.parser.phase.flushCharacters()
                      self.parser.phase = self.parser.phase.originalPhase
                  while self.tree.openElements.pop() != node:
                      assert self.tree.openElements
                  new_token = None
                  break
              # No match yet: step one element further down the stack.
              nodeIndex -= 1

              node = self.tree.openElements[nodeIndex]
              if node.namespace != self.tree.defaultNamespace:
                  continue
              else:
                  new_token = self.parser.phase.processEndTag(token)
                  break
          return new_token
  class AfterBodyPhase(Phase):
      """Token handlers for the "afterBody" phase."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_dispatch = _utils.MethodDispatcher([
              ("html", self.startTagHtml)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

          end_dispatch = _utils.MethodDispatcher([
              ("html", self.endTagHtml)
          ])
          end_dispatch.default = self.endTagOther
          self.endTagHandler = end_dispatch

      def processEOF(self):
          """Nothing to do at end of file: parsing stops."""
          return None

      def processComment(self, token):
          """Insert the comment under the first element on the stack."""
          # The comment has to be appended to the <html> element here, not
          # to whatever element is currently open.
          root = self.tree.openElements[0]
          self.tree.insertComment(token, root)

      def processCharacters(self, token):
          """Report stray characters, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("unexpected-char-after-body")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate ``<html>`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("unexpected-start-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def endTagHtml(self, name):
          """Handle ``</html>``.

          In innerHTML mode this is a parse error; otherwise the parser moves
          to the "afterAfterBody" phase.  ``name`` receives the end-tag token
          and is not used.
          """
          parser = self.parser
          if not parser.innerHTML:
              parser.phase = parser.phases["afterAfterBody"]
              return
          parser.parseError("unexpected-end-tag-after-body-innerhtml")

      def endTagOther(self, token):
          """Report the end tag, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("unexpected-end-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class InFramesetPhase(Phase):
      """Token handlers for the "inFrameset" phase."""
      # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_dispatch = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("frameset", self.startTagFrameset),
              ("frame", self.startTagFrame),
              ("noframes", self.startTagNoframes)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

          end_dispatch = _utils.MethodDispatcher([
              ("frameset", self.endTagFrameset)
          ])
          end_dispatch.default = self.endTagOther
          self.endTagHandler = end_dispatch

      def processEOF(self):
          """Report ``eof-in-frameset`` unless the current node is ``html``."""
          if self.tree.openElements[-1].name == "html":
              # Only the root is open: expected for fragment parsing only.
              assert self.parser.innerHTML
          else:
              self.parser.parseError("eof-in-frameset")

      def processCharacters(self, token):
          """Report character data as ``unexpected-char-in-frameset``."""
          self.parser.parseError("unexpected-char-in-frameset")

      def startTagFrameset(self, token):
          """Insert a nested ``<frameset>`` element."""
          self.tree.insertElement(token)

      def startTagFrame(self, token):
          """Insert a ``<frame>`` element and pop it again straight away."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()

      def startTagNoframes(self, token):
          """Delegate ``<noframes>`` to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-in-frameset", details)

      def endTagFrameset(self, token):
          """Handle ``</frameset>``.

          Pops the current node unless it is ``html`` (a parse error).  Then,
          outside innerHTML mode, switches to "afterFrameset" once the
          current node is no longer a ``frameset``.
          """
          stack = self.tree.openElements
          if stack[-1].name == "html":
              # innerHTML case
              self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
          else:
              stack.pop()
          # Stay in this phase while in innerHTML mode or while the current
          # node is still a "frameset" element.
          if self.parser.innerHTML or stack[-1].name == "frameset":
              return
          self.parser.phase = self.parser.phases["afterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-in-frameset", details)
  class AfterFramesetPhase(Phase):
      """Token handlers for the "afterFrameset" phase."""
      # http://www.whatwg.org/specs/web-apps/current-work/#after3

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # NOTE(review): startTagHtml is not defined in this class; it is
          # presumably inherited from Phase -- confirm.
          start_dispatch = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoframes)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

          end_dispatch = _utils.MethodDispatcher([
              ("html", self.endTagHtml)
          ])
          end_dispatch.default = self.endTagOther
          self.endTagHandler = end_dispatch

      def processEOF(self):
          """Nothing to do at end of file: parsing stops."""
          return None

      def processCharacters(self, token):
          """Report character data as ``unexpected-char-after-frameset``."""
          self.parser.parseError("unexpected-char-after-frameset")

      def startTagNoframes(self, token):
          """Delegate ``<noframes>`` to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-after-frameset",
                                 details)

      def endTagHtml(self, token):
          """Switch the parser to the "afterAfterFrameset" phase."""
          parser = self.parser
          parser.phase = parser.phases["afterAfterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-after-frameset", details)
  class AfterAfterBodyPhase(Phase):
      """Token handlers for the "afterAfterBody" phase.

      Unexpected characters and tags are reported, the parser is switched
      back to "inBody" and the token is returned to the caller.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_dispatch = _utils.MethodDispatcher([
              ("html", self.startTagHtml)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

      def processEOF(self):
          """Nothing to do at end of file."""
          return None

      def processComment(self, token):
          """Insert the comment directly under the document node."""
          tree = self.tree
          tree.insertComment(token, tree.document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report stray characters, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-char")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate ``<html>`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-start-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def processEndTag(self, token):
          """Report the end tag, switch to "inBody", return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-end-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class AfterAfterFramesetPhase(Phase):
      """Token handlers for the "afterAfterFrameset" phase.

      Unexpected characters and tags are only reported; the phase is not
      changed and no token is handed back.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_dispatch = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoFrames)
          ])
          start_dispatch.default = self.startTagOther
          self.startTagHandler = start_dispatch

      def processEOF(self):
          """Nothing to do at end of file."""
          return None

      def processComment(self, token):
          """Insert the comment directly under the document node."""
          tree = self.tree
          tree.insertComment(token, tree.document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report character data as ``expected-eof-but-got-char``."""
          self.parser.parseError("expected-eof-but-got-char")

      def startTagHtml(self, token):
          """Delegate ``<html>`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagNoFrames(self, token):
          """Delegate ``<noframes>`` to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-start-tag", details)

      def processEndTag(self, token):
          """Report any end tag as unexpected."""
          details = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-end-tag", details)
  2258. # pylint:enable=unused-argument
  2259. return {
  2260. "initial": InitialPhase,
  2261. "beforeHtml": BeforeHtmlPhase,
  2262. "beforeHead": BeforeHeadPhase,
  2263. "inHead": InHeadPhase,
  2264. "inHeadNoscript": InHeadNoscriptPhase,
  2265. "afterHead": AfterHeadPhase,
  2266. "inBody": InBodyPhase,
  2267. "text": TextPhase,
  2268. "inTable": InTablePhase,
  2269. "inTableText": InTableTextPhase,
  2270. "inCaption": InCaptionPhase,
  2271. "inColumnGroup": InColumnGroupPhase,
  2272. "inTableBody": InTableBodyPhase,
  2273. "inRow": InRowPhase,
  2274. "inCell": InCellPhase,
  2275. "inSelect": InSelectPhase,
  2276. "inSelectInTable": InSelectInTablePhase,
  2277. "inForeignContent": InForeignContentPhase,
  2278. "afterBody": AfterBodyPhase,
  2279. "inFrameset": InFramesetPhase,
  2280. "afterFrameset": AfterFramesetPhase,
  2281. "afterAfterBody": AfterAfterBodyPhase,
  2282. "afterAfterFrameset": AfterAfterFramesetPhase,
  2283. # XXX after after frameset
  2284. }
  def adjust_attributes(token, replacements):
      """Rename attributes of ``token`` in place according to ``replacements``.

      :arg token: token dict whose ``"data"`` entry maps attribute names to
          values

      :arg replacements: mapping of attribute name to replacement name

      When at least one attribute name appears in ``replacements``,
      ``token["data"]`` is replaced by a new ``OrderedDict`` holding the same
      values in the same order under the renamed keys.  Otherwise the token
      is left untouched.
      """
      attributes = token['data']
      # A plain membership scan answers "does anything need renaming?" on
      # every Python version, so no version-specific key-view or frozenset
      # intersection has to be built.
      if any(name in replacements for name in attributes):
          token['data'] = OrderedDict((replacements.get(name, name), value)
                                      for name, value in attributes.items())
  def impliedTagToken(name, type="EndTag", attributes=None,
                      selfClosing=False):
      """Build a tag token dict for a tag that was not in the input.

      :arg name: tag name stored under ``"name"``

      :arg type: key looked up in ``tokenTypes`` for the ``"type"`` entry

      :arg attributes: stored under ``"data"``; a fresh empty dict is used
          when ``None``

      :arg selfClosing: stored under ``"selfClosing"``

      :returns: the new token dict
      """
      implied = {"type": tokenTypes[type], "name": name}
      implied["data"] = {} if attributes is None else attributes
      implied["selfClosing"] = selfClosing
      return implied
  class ParseError(Exception):
      """Exception type representing an error in the parsed document."""