Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py')
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py  1721
1 file changed, 0 insertions, 1721 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py
deleted file mode 100644
index ef1ccf8..0000000
--- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py
+++ /dev/null
@@ -1,1721 +0,0 @@
1 from __future__ import absolute_import, division, unicode_literals
2
3 from pip._vendor.six import unichr as chr
4
5 from collections import deque
6
7 from .constants import spaceCharacters
8 from .constants import entities
9 from .constants import asciiLetters, asciiUpper2Lower
10 from .constants import digits, hexDigits, EOF
11 from .constants import tokenTypes, tagTokenTypes
12 from .constants import replacementCharacters
13
14 from ._inputstream import HTMLInputStream
15
16 from ._trie import Trie
17
18 entitiesTrie = Trie(entities)
19
20
21 class HTMLTokenizer(object):
22 """ This class takes care of tokenizing HTML.
23
24 * self.currentToken
25 Holds the token that is currently being processed.
26
27 * self.state
28 Holds a reference to the state method to be invoked next.
29
30 * self.stream
31 Points to the HTMLInputStream object.
32 """
33
34 def __init__(self, stream, parser=None, **kwargs):
35
36 self.stream = HTMLInputStream(stream, **kwargs)
37 self.parser = parser
38
39 # Setup the initial tokenizer state
40 self.escapeFlag = False
41 self.lastFourChars = []
42 self.state = self.dataState
43 self.escape = False
44
45 # The current token being created
46 self.currentToken = None
47 super(HTMLTokenizer, self).__init__()
48
49 def __iter__(self):
50 """ This is where the magic happens.
51
52 We do our usual processing through the states, and when we have a token
53 to return we yield it, which pauses processing until the next token
54 is requested.
55 """
56 self.tokenQueue = deque([])
57 # Start processing. When EOF is reached self.state will return False
58 # instead of True and the loop will terminate.
59 while self.state():
60 while self.stream.errors:
61 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
62 while self.tokenQueue:
63 yield self.tokenQueue.popleft()
64
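# Illustrative usage sketch (standalone, not part of the original module;
# the helper name _example_tokenize is hypothetical): the tokenizer is an
# iterable of token dicts, and iteration stops once a state method returns
# False at EOF.
def _example_tokenize():
    for token in HTMLTokenizer("<p class='x'>Hi &amp; bye</p>"):
        print(token["type"], token.get("name", token.get("data")))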
65 def consumeNumberEntity(self, isHex):
66 """This function returns either U+FFFD or the character corresponding to
67 the decimal or hexadecimal representation. It also discards ";" if present;
68 if the ";" is absent, a "numeric-entity-without-semicolon" ParseError token is queued.
69 """
70
71 allowed = digits
72 radix = 10
73 if isHex:
74 allowed = hexDigits
75 radix = 16
76
77 charStack = []
78
79 # Consume all the characters that are in range while making sure we
80 # don't hit an EOF.
81 c = self.stream.char()
82 while c in allowed and c is not EOF:
83 charStack.append(c)
84 c = self.stream.char()
85
86 # Convert the characters consumed so far to an int.
87 charAsInt = int("".join(charStack), radix)
88
89 # Certain characters get replaced with others
90 if charAsInt in replacementCharacters:
91 char = replacementCharacters[charAsInt]
92 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
93 "illegal-codepoint-for-numeric-entity",
94 "datavars": {"charAsInt": charAsInt}})
95 elif ((0xD800 <= charAsInt <= 0xDFFF) or
96 (charAsInt > 0x10FFFF)):
97 char = "\uFFFD"
98 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99 "illegal-codepoint-for-numeric-entity",
100 "datavars": {"charAsInt": charAsInt}})
101 else:
102 # Should speed up this check somehow (e.g. move the set to a constant)
103 if ((0x0001 <= charAsInt <= 0x0008) or
104 (0x000E <= charAsInt <= 0x001F) or
105 (0x007F <= charAsInt <= 0x009F) or
106 (0xFDD0 <= charAsInt <= 0xFDEF) or
107 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
108 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
109 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
110 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
111 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
112 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
113 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
114 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
115 0xFFFFF, 0x10FFFE, 0x10FFFF])):
116 self.tokenQueue.append({"type": tokenTypes["ParseError"],
117 "data":
118 "illegal-codepoint-for-numeric-entity",
119 "datavars": {"charAsInt": charAsInt}})
120 try:
121 # Try/except needed as UCS-2 Python builds' unichr only works
122 # within the BMP.
123 char = chr(charAsInt)
124 except ValueError:
125 v = charAsInt - 0x10000
126 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
127
128 # Discard the ; if present. Otherwise, put it back on the stream and
129 # queue a "numeric-entity-without-semicolon" parse error.
130 if c != ";":
131 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
132 "numeric-entity-without-semicolon"})
133 self.stream.unget(c)
134
135 return char
136
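# Illustrative sketch (hypothetical helper, not from the original source):
# the arithmetic behind consumeNumberEntity. "&#x41;" and "&#65;" both name
# U+0041; surrogates and out-of-range values come back as U+FFFD with a
# ParseError queued.
def _example_numeric_entity():
    assert int("41", 16) == int("65", 10) == 65
    assert chr(65) == "A"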
137 def consumeEntity(self, allowedChar=None, fromAttribute=False):
138 # Initialise to the default output for when no entity is matched
139 output = "&"
140
141 charStack = [self.stream.char()]
142 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
143 (allowedChar is not None and allowedChar == charStack[0])):
144 self.stream.unget(charStack[0])
145
146 elif charStack[0] == "#":
147 # Read the next character to see if it's hex or decimal
148 hex = False
149 charStack.append(self.stream.char())
150 if charStack[-1] in ("x", "X"):
151 hex = True
152 charStack.append(self.stream.char())
153
154 # charStack[-1] should be the first digit
155 if (hex and charStack[-1] in hexDigits) \
156 or (not hex and charStack[-1] in digits):
157 # At least one digit found, so consume the whole number
158 self.stream.unget(charStack[-1])
159 output = self.consumeNumberEntity(hex)
160 else:
161 # No digits found
162 self.tokenQueue.append({"type": tokenTypes["ParseError"],
163 "data": "expected-numeric-entity"})
164 self.stream.unget(charStack.pop())
165 output = "&" + "".join(charStack)
166
167 else:
168 # At this point in the process we might have a named entity. Entities
169 # are stored in the global variable "entities".
170 #
171 # Consume characters and compare them to a substring of the
172 # entity names in the list until the substring no longer matches.
173 while (charStack[-1] is not EOF):
174 if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
175 break
176 charStack.append(self.stream.char())
177
178 # At this point we have a string that starts with some characters
179 # that may match an entity.
180 # Try to find the longest entity the string will match, to take care
181 # of cases like &noti.
182 try:
183 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
184 entityLength = len(entityName)
185 except KeyError:
186 entityName = None
187
188 if entityName is not None:
189 if entityName[-1] != ";":
190 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
191 "named-entity-without-semicolon"})
192 if (entityName[-1] != ";" and fromAttribute and
193 (charStack[entityLength] in asciiLetters or
194 charStack[entityLength] in digits or
195 charStack[entityLength] == "=")):
196 self.stream.unget(charStack.pop())
197 output = "&" + "".join(charStack)
198 else:
199 output = entities[entityName]
200 self.stream.unget(charStack.pop())
201 output += "".join(charStack[entityLength:])
202 else:
203 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
204 "expected-named-entity"})
205 self.stream.unget(charStack.pop())
206 output = "&" + "".join(charStack)
207
208 if fromAttribute:
209 self.currentToken["data"][-1][1] += output
210 else:
211 if output in spaceCharacters:
212 tokenType = "SpaceCharacters"
213 else:
214 tokenType = "Characters"
215 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
216
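# Illustrative sketch (hypothetical helper; relies only on names defined in
# this module): the trie-backed longest-prefix match is what handles inputs
# like "&noti", where "not" is the longest entity name that matches.
def _example_named_entity():
    assert entitiesTrie.longest_prefix("noti") == "not"
    assert entities["not"] == "\u00ac"    # so "&noti" yields "¬" plus "i"
    assert entities["amp;"] == "&"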
217 def processEntityInAttribute(self, allowedChar):
218 """This method replaces the need for "entityInAttributeValueState".
219 """
220 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
221
222 def emitCurrentToken(self):
223 """This method is a generic handler for emitting tag tokens. It also sets
224 the state to "data" because that's what's needed after a token has been
225 emitted.
226 """
227 token = self.currentToken
228 # Add token to the queue to be yielded
229 if (token["type"] in tagTokenTypes):
230 token["name"] = token["name"].translate(asciiUpper2Lower)
231 if token["type"] == tokenTypes["EndTag"]:
232 if token["data"]:
233 self.tokenQueue.append({"type": tokenTypes["ParseError"],
234 "data": "attributes-in-end-tag"})
235 if token["selfClosing"]:
236 self.tokenQueue.append({"type": tokenTypes["ParseError"],
237 "data": "self-closing-flag-on-end-tag"})
238 self.tokenQueue.append(token)
239 self.state = self.dataState
240
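# Illustrative sketch (hypothetical helper, not part of the original file):
# the shape of a start tag token as emitCurrentToken queues it. Note that
# "data" is still a list of [name, value] pairs here; the parser, not the
# tokenizer, turns it into a dict.
def _example_tag_token():
    return {"type": tokenTypes["StartTag"],
            "name": "input",              # already lowercased by emitCurrentToken
            "data": [["type", "text"]],
            "selfClosing": False,
            "selfClosingAcknowledged": False}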
241 # Below are the various tokenizer states, implemented as methods.
242 def dataState(self):
243 data = self.stream.char()
244 if data == "&":
245 self.state = self.entityDataState
246 elif data == "<":
247 self.state = self.tagOpenState
248 elif data == "\u0000":
249 self.tokenQueue.append({"type": tokenTypes["ParseError"],
250 "data": "invalid-codepoint"})
251 self.tokenQueue.append({"type": tokenTypes["Characters"],
252 "data": "\u0000"})
253 elif data is EOF:
254 # Tokenization ends.
255 return False
256 elif data in spaceCharacters:
257 # Directly after emitting a token you switch back to the "data
258 # state". At that point spaceCharacters are important so they are
259 # emitted separately.
260 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
261 data + self.stream.charsUntil(spaceCharacters, True)})
262 # No need to update lastFourChars here, since the first space will
263 # have already been appended to lastFourChars and will have broken
264 # any <!-- or --> sequences
265 else:
266 chars = self.stream.charsUntil(("&", "<", "\u0000"))
267 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
268 data + chars})
269 return True
270
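# Illustrative toy sketch (not from the original source): the driver pattern
# behind __iter__ and the *State methods. Each state consumes input, may
# queue output, picks the next state, and returns False only at EOF.
class _ToyStateMachine(object):
    def __init__(self, text):
        self.chars = iter(text)
        self.out = []
        self.state = self.dataState

    def dataState(self):
        c = next(self.chars, None)
        if c is None:
            return False    # EOF stops the pump loop
        self.out.append(c)
        return True         # keep pumping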
271 def entityDataState(self):
272 self.consumeEntity()
273 self.state = self.dataState
274 return True
275
276 def rcdataState(self):
277 data = self.stream.char()
278 if data == "&":
279 self.state = self.characterReferenceInRcdata
280 elif data == "<":
281 self.state = self.rcdataLessThanSignState
282 elif data == EOF:
283 # Tokenization ends.
284 return False
285 elif data == "\u0000":
286 self.tokenQueue.append({"type": tokenTypes["ParseError"],
287 "data": "invalid-codepoint"})
288 self.tokenQueue.append({"type": tokenTypes["Characters"],
289 "data": "\uFFFD"})
290 elif data in spaceCharacters:
291 # Directly after emitting a token you switch back to the "data
292 # state". At that point spaceCharacters are important so they are
293 # emitted separately.
294 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
295 data + self.stream.charsUntil(spaceCharacters, True)})
296 # No need to update lastFourChars here, since the first space will
297 # have already been appended to lastFourChars and will have broken
298 # any <!-- or --> sequences
299 else:
300 chars = self.stream.charsUntil(("&", "<", "\u0000"))
301 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
302 data + chars})
303 return True
304
305 def characterReferenceInRcdata(self):
306 self.consumeEntity()
307 self.state = self.rcdataState
308 return True
309
310 def rawtextState(self):
311 data = self.stream.char()
312 if data == "<":
313 self.state = self.rawtextLessThanSignState
314 elif data == "\u0000":
315 self.tokenQueue.append({"type": tokenTypes["ParseError"],
316 "data": "invalid-codepoint"})
317 self.tokenQueue.append({"type": tokenTypes["Characters"],
318 "data": "\uFFFD"})
319 elif data == EOF:
320 # Tokenization ends.
321 return False
322 else:
323 chars = self.stream.charsUntil(("<", "\u0000"))
324 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
325 data + chars})
326 return True
327
328 def scriptDataState(self):
329 data = self.stream.char()
330 if data == "<":
331 self.state = self.scriptDataLessThanSignState
332 elif data == "\u0000":
333 self.tokenQueue.append({"type": tokenTypes["ParseError"],
334 "data": "invalid-codepoint"})
335 self.tokenQueue.append({"type": tokenTypes["Characters"],
336 "data": "\uFFFD"})
337 elif data == EOF:
338 # Tokenization ends.
339 return False
340 else:
341 chars = self.stream.charsUntil(("<", "\u0000"))
342 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
343 data + chars})
344 return True
345
346 def plaintextState(self):
347 data = self.stream.char()
348 if data == EOF:
349 # Tokenization ends.
350 return False
351 elif data == "\u0000":
352 self.tokenQueue.append({"type": tokenTypes["ParseError"],
353 "data": "invalid-codepoint"})
354 self.tokenQueue.append({"type": tokenTypes["Characters"],
355 "data": "\uFFFD"})
356 else:
357 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
358 data + self.stream.charsUntil("\u0000")})
359 return True
360
361 def tagOpenState(self):
362 data = self.stream.char()
363 if data == "!":
364 self.state = self.markupDeclarationOpenState
365 elif data == "/":
366 self.state = self.closeTagOpenState
367 elif data in asciiLetters:
368 self.currentToken = {"type": tokenTypes["StartTag"],
369 "name": data, "data": [],
370 "selfClosing": False,
371 "selfClosingAcknowledged": False}
372 self.state = self.tagNameState
373 elif data == ">":
374 # XXX In theory it could be something besides a tag name. But
375 # do we really care?
376 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
377 "expected-tag-name-but-got-right-bracket"})
378 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
379 self.state = self.dataState
380 elif data == "?":
381 # XXX In theory it could be something besides a tag name. But
382 # do we really care?
383 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
384 "expected-tag-name-but-got-question-mark"})
385 self.stream.unget(data)
386 self.state = self.bogusCommentState
387 else:
388 # XXX
389 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
390 "expected-tag-name"})
391 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
392 self.stream.unget(data)
393 self.state = self.dataState
394 return True
395
396 def closeTagOpenState(self):
397 data = self.stream.char()
398 if data in asciiLetters:
399 self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
400 "data": [], "selfClosing": False}
401 self.state = self.tagNameState
402 elif data == ">":
403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404 "expected-closing-tag-but-got-right-bracket"})
405 self.state = self.dataState
406 elif data is EOF:
407 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
408 "expected-closing-tag-but-got-eof"})
409 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
410 self.state = self.dataState
411 else:
412 # XXX data can be _'_...
413 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
414 "expected-closing-tag-but-got-char",
415 "datavars": {"data": data}})
416 self.stream.unget(data)
417 self.state = self.bogusCommentState
418 return True
419
420 def tagNameState(self):
421 data = self.stream.char()
422 if data in spaceCharacters:
423 self.state = self.beforeAttributeNameState
424 elif data == ">":
425 self.emitCurrentToken()
426 elif data is EOF:
427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428 "eof-in-tag-name"})
429 self.state = self.dataState
430 elif data == "/":
431 self.state = self.selfClosingStartTagState
432 elif data == "\u0000":
433 self.tokenQueue.append({"type": tokenTypes["ParseError"],
434 "data": "invalid-codepoint"})
435 self.currentToken["name"] += "\uFFFD"
436 else:
437 self.currentToken["name"] += data
438 # (Don't use charsUntil here, because tag names are
439 # very short and it's faster to not do anything fancy)
440 return True
441
442 def rcdataLessThanSignState(self):
443 data = self.stream.char()
444 if data == "/":
445 self.temporaryBuffer = ""
446 self.state = self.rcdataEndTagOpenState
447 else:
448 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
449 self.stream.unget(data)
450 self.state = self.rcdataState
451 return True
452
453 def rcdataEndTagOpenState(self):
454 data = self.stream.char()
455 if data in asciiLetters:
456 self.temporaryBuffer += data
457 self.state = self.rcdataEndTagNameState
458 else:
459 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
460 self.stream.unget(data)
461 self.state = self.rcdataState
462 return True
463
464 def rcdataEndTagNameState(self):
465 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
466 data = self.stream.char()
467 if data in spaceCharacters and appropriate:
468 self.currentToken = {"type": tokenTypes["EndTag"],
469 "name": self.temporaryBuffer,
470 "data": [], "selfClosing": False}
471 self.state = self.beforeAttributeNameState
472 elif data == "/" and appropriate:
473 self.currentToken = {"type": tokenTypes["EndTag"],
474 "name": self.temporaryBuffer,
475 "data": [], "selfClosing": False}
476 self.state = self.selfClosingStartTagState
477 elif data == ">" and appropriate:
478 self.currentToken = {"type": tokenTypes["EndTag"],
479 "name": self.temporaryBuffer,
480 "data": [], "selfClosing": False}
481 self.emitCurrentToken()
482 self.state = self.dataState
483 elif data in asciiLetters:
484 self.temporaryBuffer += data
485 else:
486 self.tokenQueue.append({"type": tokenTypes["Characters"],
487 "data": "</" + self.temporaryBuffer})
488 self.stream.unget(data)
489 self.state = self.rcdataState
490 return True
491
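# Illustrative sketch (hypothetical helper): the "appropriate" check above
# means an end tag in RCDATA only closes the element when it matches the
# current open tag; anything else is replayed as character data.
def _example_appropriate_end_tag():
    assert "TITLE".lower() == "title".lower()   # </TITLE> closes <title>
    assert "p".lower() != "title".lower()       # </p> inside <title> is text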
492 def rawtextLessThanSignState(self):
493 data = self.stream.char()
494 if data == "/":
495 self.temporaryBuffer = ""
496 self.state = self.rawtextEndTagOpenState
497 else:
498 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
499 self.stream.unget(data)
500 self.state = self.rawtextState
501 return True
502
503 def rawtextEndTagOpenState(self):
504 data = self.stream.char()
505 if data in asciiLetters:
506 self.temporaryBuffer += data
507 self.state = self.rawtextEndTagNameState
508 else:
509 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
510 self.stream.unget(data)
511 self.state = self.rawtextState
512 return True
513
514 def rawtextEndTagNameState(self):
515 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
516 data = self.stream.char()
517 if data in spaceCharacters and appropriate:
518 self.currentToken = {"type": tokenTypes["EndTag"],
519 "name": self.temporaryBuffer,
520 "data": [], "selfClosing": False}
521 self.state = self.beforeAttributeNameState
522 elif data == "/" and appropriate:
523 self.currentToken = {"type": tokenTypes["EndTag"],
524 "name": self.temporaryBuffer,
525 "data": [], "selfClosing": False}
526 self.state = self.selfClosingStartTagState
527 elif data == ">" and appropriate:
528 self.currentToken = {"type": tokenTypes["EndTag"],
529 "name": self.temporaryBuffer,
530 "data": [], "selfClosing": False}
531 self.emitCurrentToken()
532 self.state = self.dataState
533 elif data in asciiLetters:
534 self.temporaryBuffer += data
535 else:
536 self.tokenQueue.append({"type": tokenTypes["Characters"],
537 "data": "</" + self.temporaryBuffer})
538 self.stream.unget(data)
539 self.state = self.rawtextState
540 return True
541
542 def scriptDataLessThanSignState(self):
543 data = self.stream.char()
544 if data == "/":
545 self.temporaryBuffer = ""
546 self.state = self.scriptDataEndTagOpenState
547 elif data == "!":
548 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
549 self.state = self.scriptDataEscapeStartState
550 else:
551 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
552 self.stream.unget(data)
553 self.state = self.scriptDataState
554 return True
555
556 def scriptDataEndTagOpenState(self):
557 data = self.stream.char()
558 if data in asciiLetters:
559 self.temporaryBuffer += data
560 self.state = self.scriptDataEndTagNameState
561 else:
562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
563 self.stream.unget(data)
564 self.state = self.scriptDataState
565 return True
566
567 def scriptDataEndTagNameState(self):
568 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
569 data = self.stream.char()
570 if data in spaceCharacters and appropriate:
571 self.currentToken = {"type": tokenTypes["EndTag"],
572 "name": self.temporaryBuffer,
573 "data": [], "selfClosing": False}
574 self.state = self.beforeAttributeNameState
575 elif data == "/" and appropriate:
576 self.currentToken = {"type": tokenTypes["EndTag"],
577 "name": self.temporaryBuffer,
578 "data": [], "selfClosing": False}
579 self.state = self.selfClosingStartTagState
580 elif data == ">" and appropriate:
581 self.currentToken = {"type": tokenTypes["EndTag"],
582 "name": self.temporaryBuffer,
583 "data": [], "selfClosing": False}
584 self.emitCurrentToken()
585 self.state = self.dataState
586 elif data in asciiLetters:
587 self.temporaryBuffer += data
588 else:
589 self.tokenQueue.append({"type": tokenTypes["Characters"],
590 "data": "</" + self.temporaryBuffer})
591 self.stream.unget(data)
592 self.state = self.scriptDataState
593 return True
594
595 def scriptDataEscapeStartState(self):
596 data = self.stream.char()
597 if data == "-":
598 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
599 self.state = self.scriptDataEscapeStartDashState
600 else:
601 self.stream.unget(data)
602 self.state = self.scriptDataState
603 return True
604
605 def scriptDataEscapeStartDashState(self):
606 data = self.stream.char()
607 if data == "-":
608 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
609 self.state = self.scriptDataEscapedDashDashState
610 else:
611 self.stream.unget(data)
612 self.state = self.scriptDataState
613 return True
614
615 def scriptDataEscapedState(self):
616 data = self.stream.char()
617 if data == "-":
618 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
619 self.state = self.scriptDataEscapedDashState
620 elif data == "<":
621 self.state = self.scriptDataEscapedLessThanSignState
622 elif data == "\u0000":
623 self.tokenQueue.append({"type": tokenTypes["ParseError"],
624 "data": "invalid-codepoint"})
625 self.tokenQueue.append({"type": tokenTypes["Characters"],
626 "data": "\uFFFD"})
627 elif data == EOF:
628 self.state = self.dataState
629 else:
630 chars = self.stream.charsUntil(("<", "-", "\u0000"))
631 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
632 data + chars})
633 return True
634
635 def scriptDataEscapedDashState(self):
636 data = self.stream.char()
637 if data == "-":
638 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
639 self.state = self.scriptDataEscapedDashDashState
640 elif data == "<":
641 self.state = self.scriptDataEscapedLessThanSignState
642 elif data == "\u0000":
643 self.tokenQueue.append({"type": tokenTypes["ParseError"],
644 "data": "invalid-codepoint"})
645 self.tokenQueue.append({"type": tokenTypes["Characters"],
646 "data": "\uFFFD"})
647 self.state = self.scriptDataEscapedState
648 elif data == EOF:
649 self.state = self.dataState
650 else:
651 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
652 self.state = self.scriptDataEscapedState
653 return True
654
655 def scriptDataEscapedDashDashState(self):
656 data = self.stream.char()
657 if data == "-":
658 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
659 elif data == "<":
660 self.state = self.scriptDataEscapedLessThanSignState
661 elif data == ">":
662 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
663 self.state = self.scriptDataState
664 elif data == "\u0000":
665 self.tokenQueue.append({"type": tokenTypes["ParseError"],
666 "data": "invalid-codepoint"})
667 self.tokenQueue.append({"type": tokenTypes["Characters"],
668 "data": "\uFFFD"})
669 self.state = self.scriptDataEscapedState
670 elif data == EOF:
671 self.state = self.dataState
672 else:
673 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
674 self.state = self.scriptDataEscapedState
675 return True
676
677 def scriptDataEscapedLessThanSignState(self):
678 data = self.stream.char()
679 if data == "/":
680 self.temporaryBuffer = ""
681 self.state = self.scriptDataEscapedEndTagOpenState
682 elif data in asciiLetters:
683 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
684 self.temporaryBuffer = data
685 self.state = self.scriptDataDoubleEscapeStartState
686 else:
687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
688 self.stream.unget(data)
689 self.state = self.scriptDataEscapedState
690 return True
691
692 def scriptDataEscapedEndTagOpenState(self):
693 data = self.stream.char()
694 if data in asciiLetters:
695 self.temporaryBuffer = data
696 self.state = self.scriptDataEscapedEndTagNameState
697 else:
698 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
699 self.stream.unget(data)
700 self.state = self.scriptDataEscapedState
701 return True
702
703 def scriptDataEscapedEndTagNameState(self):
704 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
705 data = self.stream.char()
706 if data in spaceCharacters and appropriate:
707 self.currentToken = {"type": tokenTypes["EndTag"],
708 "name": self.temporaryBuffer,
709 "data": [], "selfClosing": False}
710 self.state = self.beforeAttributeNameState
711 elif data == "/" and appropriate:
712 self.currentToken = {"type": tokenTypes["EndTag"],
713 "name": self.temporaryBuffer,
714 "data": [], "selfClosing": False}
715 self.state = self.selfClosingStartTagState
716 elif data == ">" and appropriate:
717 self.currentToken = {"type": tokenTypes["EndTag"],
718 "name": self.temporaryBuffer,
719 "data": [], "selfClosing": False}
720 self.emitCurrentToken()
721 self.state = self.dataState
722 elif data in asciiLetters:
723 self.temporaryBuffer += data
724 else:
725 self.tokenQueue.append({"type": tokenTypes["Characters"],
726 "data": "</" + self.temporaryBuffer})
727 self.stream.unget(data)
728 self.state = self.scriptDataEscapedState
729 return True
730
731 def scriptDataDoubleEscapeStartState(self):
732 data = self.stream.char()
733 if data in (spaceCharacters | frozenset(("/", ">"))):
734 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
735 if self.temporaryBuffer.lower() == "script":
736 self.state = self.scriptDataDoubleEscapedState
737 else:
738 self.state = self.scriptDataEscapedState
739 elif data in asciiLetters:
740 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
741 self.temporaryBuffer += data
742 else:
743 self.stream.unget(data)
744 self.state = self.scriptDataEscapedState
745 return True
746
747 def scriptDataDoubleEscapedState(self):
748 data = self.stream.char()
749 if data == "-":
750 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
751 self.state = self.scriptDataDoubleEscapedDashState
752 elif data == "<":
753 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
754 self.state = self.scriptDataDoubleEscapedLessThanSignState
755 elif data == "\u0000":
756 self.tokenQueue.append({"type": tokenTypes["ParseError"],
757 "data": "invalid-codepoint"})
758 self.tokenQueue.append({"type": tokenTypes["Characters"],
759 "data": "\uFFFD"})
760 elif data == EOF:
761 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
762 "eof-in-script-in-script"})
763 self.state = self.dataState
764 else:
765 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
766 return True
767
768 def scriptDataDoubleEscapedDashState(self):
769 data = self.stream.char()
770 if data == "-":
771 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
772 self.state = self.scriptDataDoubleEscapedDashDashState
773 elif data == "<":
774 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
775 self.state = self.scriptDataDoubleEscapedLessThanSignState
776 elif data == "\u0000":
777 self.tokenQueue.append({"type": tokenTypes["ParseError"],
778 "data": "invalid-codepoint"})
779 self.tokenQueue.append({"type": tokenTypes["Characters"],
780 "data": "\uFFFD"})
781 self.state = self.scriptDataDoubleEscapedState
782 elif data == EOF:
783 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
784 "eof-in-script-in-script"})
785 self.state = self.dataState
786 else:
787 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
788 self.state = self.scriptDataDoubleEscapedState
789 return True
790
791 def scriptDataDoubleEscapedDashDashState(self):
792 data = self.stream.char()
793 if data == "-":
794 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
795 elif data == "<":
796 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
797 self.state = self.scriptDataDoubleEscapedLessThanSignState
798 elif data == ">":
799 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
800 self.state = self.scriptDataState
801 elif data == "\u0000":
802 self.tokenQueue.append({"type": tokenTypes["ParseError"],
803 "data": "invalid-codepoint"})
804 self.tokenQueue.append({"type": tokenTypes["Characters"],
805 "data": "\uFFFD"})
806 self.state = self.scriptDataDoubleEscapedState
807 elif data == EOF:
808 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
809 "eof-in-script-in-script"})
810 self.state = self.dataState
811 else:
812 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
813 self.state = self.scriptDataDoubleEscapedState
814 return True
815
816 def scriptDataDoubleEscapedLessThanSignState(self):
817 data = self.stream.char()
818 if data == "/":
819 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
820 self.temporaryBuffer = ""
821 self.state = self.scriptDataDoubleEscapeEndState
822 else:
823 self.stream.unget(data)
824 self.state = self.scriptDataDoubleEscapedState
825 return True
826
827 def scriptDataDoubleEscapeEndState(self):
828 data = self.stream.char()
829 if data in (spaceCharacters | frozenset(("/", ">"))):
830 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
831 if self.temporaryBuffer.lower() == "script":
832 self.state = self.scriptDataEscapedState
833 else:
834 self.state = self.scriptDataDoubleEscapedState
835 elif data in asciiLetters:
836 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
837 self.temporaryBuffer += data
838 else:
839 self.stream.unget(data)
840 self.state = self.scriptDataDoubleEscapedState
841 return True
842
843 def beforeAttributeNameState(self):
844 data = self.stream.char()
845 if data in spaceCharacters:
846 self.stream.charsUntil(spaceCharacters, True)
847 elif data in asciiLetters:
848 self.currentToken["data"].append([data, ""])
849 self.state = self.attributeNameState
850 elif data == ">":
851 self.emitCurrentToken()
852 elif data == "/":
853 self.state = self.selfClosingStartTagState
854 elif data in ("'", '"', "=", "<"):
855 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
856 "invalid-character-in-attribute-name"})
857 self.currentToken["data"].append([data, ""])
858 self.state = self.attributeNameState
859 elif data == "\u0000":
860 self.tokenQueue.append({"type": tokenTypes["ParseError"],
861 "data": "invalid-codepoint"})
862 self.currentToken["data"].append(["\uFFFD", ""])
863 self.state = self.attributeNameState
864 elif data is EOF:
865 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
866 "expected-attribute-name-but-got-eof"})
867 self.state = self.dataState
868 else:
869 self.currentToken["data"].append([data, ""])
870 self.state = self.attributeNameState
871 return True
872
873 def attributeNameState(self):
874 data = self.stream.char()
875 leavingThisState = True
876 emitToken = False
877 if data == "=":
878 self.state = self.beforeAttributeValueState
879 elif data in asciiLetters:
880 self.currentToken["data"][-1][0] += data +\
881 self.stream.charsUntil(asciiLetters, True)
882 leavingThisState = False
883 elif data == ">":
884 # XXX If we emit here the attributes are converted to a dict
885 # without being checked, and when the code below runs we error
886 # because data is a dict, not a list.
887 emitToken = True
888 elif data in spaceCharacters:
889 self.state = self.afterAttributeNameState
890 elif data == "/":
891 self.state = self.selfClosingStartTagState
892 elif data == "\u0000":
893 self.tokenQueue.append({"type": tokenTypes["ParseError"],
894 "data": "invalid-codepoint"})
895 self.currentToken["data"][-1][0] += "\uFFFD"
896 leavingThisState = False
897 elif data in ("'", '"', "<"):
898 self.tokenQueue.append({"type": tokenTypes["ParseError"],
899 "data":
900 "invalid-character-in-attribute-name"})
901 self.currentToken["data"][-1][0] += data
902 leavingThisState = False
903 elif data is EOF:
904 self.tokenQueue.append({"type": tokenTypes["ParseError"],
905 "data": "eof-in-attribute-name"})
906 self.state = self.dataState
907 else:
908 self.currentToken["data"][-1][0] += data
909 leavingThisState = False
910
911 if leavingThisState:
912 # Attributes are not dropped at this stage. That happens when the
913 # start tag token is emitted, so values can still be safely appended
914 # to attributes, but we do want to report the parse error in time.
915 self.currentToken["data"][-1][0] = (
916 self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
917 for name, _ in self.currentToken["data"][:-1]:
918 if self.currentToken["data"][-1][0] == name:
919 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
920 "duplicate-attribute"})
921 break
922 # XXX Fix for above XXX
923 if emitToken:
924 self.emitCurrentToken()
925 return True
926
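# Illustrative sketch (hypothetical helper): the duplicate check above
# lowercases the just-finished attribute name and compares it against the
# earlier [name, value] pairs.
def _example_duplicate_attribute():
    data = [["id", "a"], ["ID", ""]]    # state while tokenizing <p id=a ID=b>
    data[-1][0] = data[-1][0].translate(asciiUpper2Lower)
    assert any(name == data[-1][0] for name, _ in data[:-1])   # duplicate-attribute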
927 def afterAttributeNameState(self):
928 data = self.stream.char()
929 if data in spaceCharacters:
930 self.stream.charsUntil(spaceCharacters, True)
931 elif data == "=":
932 self.state = self.beforeAttributeValueState
933 elif data == ">":
934 self.emitCurrentToken()
935 elif data in asciiLetters:
936 self.currentToken["data"].append([data, ""])
937 self.state = self.attributeNameState
938 elif data == "/":
939 self.state = self.selfClosingStartTagState
940 elif data == "\u0000":
941 self.tokenQueue.append({"type": tokenTypes["ParseError"],
942 "data": "invalid-codepoint"})
943 self.currentToken["data"].append(["\uFFFD", ""])
944 self.state = self.attributeNameState
945 elif data in ("'", '"', "<"):
946 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
947 "invalid-character-after-attribute-name"})
948 self.currentToken["data"].append([data, ""])
949 self.state = self.attributeNameState
950 elif data is EOF:
951 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
952 "expected-end-of-tag-but-got-eof"})
953 self.state = self.dataState
954 else:
955 self.currentToken["data"].append([data, ""])
956 self.state = self.attributeNameState
957 return True
958
959 def beforeAttributeValueState(self):
960 data = self.stream.char()
961 if data in spaceCharacters:
962 self.stream.charsUntil(spaceCharacters, True)
963 elif data == "\"":
964 self.state = self.attributeValueDoubleQuotedState
965 elif data == "&":
966 self.state = self.attributeValueUnQuotedState
967 self.stream.unget(data)
968 elif data == "'":
969 self.state = self.attributeValueSingleQuotedState
970 elif data == ">":
971 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
972 "expected-attribute-value-but-got-right-bracket"})
973 self.emitCurrentToken()
974 elif data == "\u0000":
975 self.tokenQueue.append({"type": tokenTypes["ParseError"],
976 "data": "invalid-codepoint"})
977 self.currentToken["data"][-1][1] += "\uFFFD"
978 self.state = self.attributeValueUnQuotedState
979 elif data in ("=", "<", "`"):
980 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
981 "equals-in-unquoted-attribute-value"})
982 self.currentToken["data"][-1][1] += data
983 self.state = self.attributeValueUnQuotedState
984 elif data is EOF:
985 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986 "expected-attribute-value-but-got-eof"})
987 self.state = self.dataState
988 else:
989 self.currentToken["data"][-1][1] += data
990 self.state = self.attributeValueUnQuotedState
991 return True
992
993 def attributeValueDoubleQuotedState(self):
994 data = self.stream.char()
995 if data == "\"":
996 self.state = self.afterAttributeValueState
997 elif data == "&":
998 self.processEntityInAttribute('"')
999 elif data == "\u0000":
1000 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1001 "data": "invalid-codepoint"})
1002 self.currentToken["data"][-1][1] += "\uFFFD"
1003 elif data is EOF:
1004 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1005 "eof-in-attribute-value-double-quote"})
1006 self.state = self.dataState
1007 else:
1008 self.currentToken["data"][-1][1] += data +\
1009 self.stream.charsUntil(("\"", "&", "\u0000"))
1010 return True
1011
1012 def attributeValueSingleQuotedState(self):
1013 data = self.stream.char()
1014 if data == "'":
1015 self.state = self.afterAttributeValueState
1016 elif data == "&":
1017 self.processEntityInAttribute("'")
1018 elif data == "\u0000":
1019 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1020 "data": "invalid-codepoint"})
1021 self.currentToken["data"][-1][1] += "\uFFFD"
1022 elif data is EOF:
1023 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1024 "eof-in-attribute-value-single-quote"})
1025 self.state = self.dataState
1026 else:
1027 self.currentToken["data"][-1][1] += data +\
1028 self.stream.charsUntil(("'", "&", "\u0000"))
1029 return True
1030
1031 def attributeValueUnQuotedState(self):
1032 data = self.stream.char()
1033 if data in spaceCharacters:
1034 self.state = self.beforeAttributeNameState
1035 elif data == "&":
1036 self.processEntityInAttribute(">")
1037 elif data == ">":
1038 self.emitCurrentToken()
1039 elif data in ('"', "'", "=", "<", "`"):
1040 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1041 "unexpected-character-in-unquoted-attribute-value"})
1042 self.currentToken["data"][-1][1] += data
1043 elif data == "\u0000":
1044 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1045 "data": "invalid-codepoint"})
1046 self.currentToken["data"][-1][1] += "\uFFFD"
1047 elif data is EOF:
1048 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1049 "eof-in-attribute-value-no-quotes"})
1050 self.state = self.dataState
1051 else:
1052 self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1053 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1054 return True
1055
1056 def afterAttributeValueState(self):
1057 data = self.stream.char()
1058 if data in spaceCharacters:
1059 self.state = self.beforeAttributeNameState
1060 elif data == ">":
1061 self.emitCurrentToken()
1062 elif data == "/":
1063 self.state = self.selfClosingStartTagState
1064 elif data is EOF:
1065 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1066 "unexpected-EOF-after-attribute-value"})
1067 self.stream.unget(data)
1068 self.state = self.dataState
1069 else:
1070 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1071 "unexpected-character-after-attribute-value"})
1072 self.stream.unget(data)
1073 self.state = self.beforeAttributeNameState
1074 return True
1075
1076 def selfClosingStartTagState(self):
1077 data = self.stream.char()
1078 if data == ">":
1079 self.currentToken["selfClosing"] = True
1080 self.emitCurrentToken()
1081 elif data is EOF:
1082 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1083 "data":
1084 "unexpected-EOF-after-solidus-in-tag"})
1085 self.stream.unget(data)
1086 self.state = self.dataState
1087 else:
1088 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1089 "unexpected-character-after-solidus-in-tag"})
1090 self.stream.unget(data)
1091 self.state = self.beforeAttributeNameState
1092 return True
1093
1094 def bogusCommentState(self):
1095 # Make a new comment token whose value is all the characters up to
1096 # the first ">" or EOF (charsUntil checks for EOF automatically),
1097 # and emit it.
1098 data = self.stream.charsUntil(">")
1099 data = data.replace("\u0000", "\uFFFD")
1100 self.tokenQueue.append(
1101 {"type": tokenTypes["Comment"], "data": data})
1102
1103 # Eat the character directly after the bogus comment which is either a
1104 # ">" or an EOF.
1105 self.stream.char()
1106 self.state = self.dataState
1107 return True
1108
1109 def markupDeclarationOpenState(self):
1110 charStack = [self.stream.char()]
1111 if charStack[-1] == "-":
1112 charStack.append(self.stream.char())
1113 if charStack[-1] == "-":
1114 self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1115 self.state = self.commentStartState
1116 return True
1117 elif charStack[-1] in ('d', 'D'):
1118 matched = True
1119 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1120 ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1121 charStack.append(self.stream.char())
1122 if charStack[-1] not in expected:
1123 matched = False
1124 break
1125 if matched:
1126 self.currentToken = {"type": tokenTypes["Doctype"],
1127 "name": "",
1128 "publicId": None, "systemId": None,
1129 "correct": True}
1130 self.state = self.doctypeState
1131 return True
1132 elif (charStack[-1] == "[" and
1133 self.parser is not None and
1134 self.parser.tree.openElements and
1135 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1136 matched = True
1137 for expected in ["C", "D", "A", "T", "A", "["]:
1138 charStack.append(self.stream.char())
1139 if charStack[-1] != expected:
1140 matched = False
1141 break
1142 if matched:
1143 self.state = self.cdataSectionState
1144 return True
1145
1146 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1147 "expected-dashes-or-doctype"})
1148
1149 while charStack:
1150 self.stream.unget(charStack.pop())
1151 self.state = self.bogusCommentState
1152 return True
1153
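# Illustrative summary sketch (hypothetical helper): the three shapes this
# state distinguishes after "<!"; anything else falls through to the bogus
# comment state.
def _example_markup_declarations():
    return {"<!--": "comment",
            "<!doctype": "doctype (matched case-insensitively)",
            "<![CDATA[": "CDATA, only inside foreign (SVG/MathML) content"}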
1154 def commentStartState(self):
1155 data = self.stream.char()
1156 if data == "-":
1157 self.state = self.commentStartDashState
1158 elif data == "\u0000":
1159 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1160 "data": "invalid-codepoint"})
1161 self.currentToken["data"] += "\uFFFD"
1162 elif data == ">":
1163 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1164 "incorrect-comment"})
1165 self.tokenQueue.append(self.currentToken)
1166 self.state = self.dataState
1167 elif data is EOF:
1168 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1169 "eof-in-comment"})
1170 self.tokenQueue.append(self.currentToken)
1171 self.state = self.dataState
1172 else:
1173 self.currentToken["data"] += data
1174 self.state = self.commentState
1175 return True
1176
1177 def commentStartDashState(self):
1178 data = self.stream.char()
1179 if data == "-":
1180 self.state = self.commentEndState
1181 elif data == "\u0000":
1182 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1183 "data": "invalid-codepoint"})
1184 self.currentToken["data"] += "-\uFFFD"
1185 elif data == ">":
1186 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1187 "incorrect-comment"})
1188 self.tokenQueue.append(self.currentToken)
1189 self.state = self.dataState
1190 elif data is EOF:
1191 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1192 "eof-in-comment"})
1193 self.tokenQueue.append(self.currentToken)
1194 self.state = self.dataState
1195 else:
1196 self.currentToken["data"] += "-" + data
1197 self.state = self.commentState
1198 return True
1199
1200 def commentState(self):
1201 data = self.stream.char()
1202 if data == "-":
1203 self.state = self.commentEndDashState
1204 elif data == "\u0000":
1205 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1206 "data": "invalid-codepoint"})
1207 self.currentToken["data"] += "\uFFFD"
1208 elif data is EOF:
1209 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1210 "data": "eof-in-comment"})
1211 self.tokenQueue.append(self.currentToken)
1212 self.state = self.dataState
1213 else:
1214 self.currentToken["data"] += data + \
1215 self.stream.charsUntil(("-", "\u0000"))
1216 return True
1217
1218 def commentEndDashState(self):
1219 data = self.stream.char()
1220 if data == "-":
1221 self.state = self.commentEndState
1222 elif data == "\u0000":
1223 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224 "data": "invalid-codepoint"})
1225 self.currentToken["data"] += "-\uFFFD"
1226 self.state = self.commentState
1227 elif data is EOF:
1228 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1229 "eof-in-comment-end-dash"})
1230 self.tokenQueue.append(self.currentToken)
1231 self.state = self.dataState
1232 else:
1233 self.currentToken["data"] += "-" + data
1234 self.state = self.commentState
1235 return True
1236
1237 def commentEndState(self):
1238 data = self.stream.char()
1239 if data == ">":
1240 self.tokenQueue.append(self.currentToken)
1241 self.state = self.dataState
1242 elif data == "\u0000":
1243 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1244 "data": "invalid-codepoint"})
1245 self.currentToken["data"] += "--\uFFFD"
1246 self.state = self.commentState
1247 elif data == "!":
1248 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1249 "unexpected-bang-after-double-dash-in-comment"})
1250 self.state = self.commentEndBangState
1251 elif data == "-":
1252 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1253 "unexpected-dash-after-double-dash-in-comment"})
1254 self.currentToken["data"] += data
1255 elif data is EOF:
1256 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1257 "eof-in-comment-double-dash"})
1258 self.tokenQueue.append(self.currentToken)
1259 self.state = self.dataState
1260 else:
1261 # XXX
1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263 "unexpected-char-in-comment"})
1264 self.currentToken["data"] += "--" + data
1265 self.state = self.commentState
1266 return True
1267
1268 def commentEndBangState(self):
1269 data = self.stream.char()
1270 if data == ">":
1271 self.tokenQueue.append(self.currentToken)
1272 self.state = self.dataState
1273 elif data == "-":
1274 self.currentToken["data"] += "--!"
1275 self.state = self.commentEndDashState
1276 elif data == "\u0000":
1277 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1278 "data": "invalid-codepoint"})
1279 self.currentToken["data"] += "--!\uFFFD"
1280 self.state = self.commentState
1281 elif data is EOF:
1282 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1283 "eof-in-comment-end-bang-state"})
1284 self.tokenQueue.append(self.currentToken)
1285 self.state = self.dataState
1286 else:
1287 self.currentToken["data"] += "--!" + data
1288 self.state = self.commentState
1289 return True
1290
1291 def doctypeState(self):
1292 data = self.stream.char()
1293 if data in spaceCharacters:
1294 self.state = self.beforeDoctypeNameState
1295 elif data is EOF:
1296 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297 "expected-doctype-name-but-got-eof"})
1298 self.currentToken["correct"] = False
1299 self.tokenQueue.append(self.currentToken)
1300 self.state = self.dataState
1301 else:
1302 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1303 "need-space-after-doctype"})
1304 self.stream.unget(data)
1305 self.state = self.beforeDoctypeNameState
1306 return True
1307
1308 def beforeDoctypeNameState(self):
1309 data = self.stream.char()
1310 if data in spaceCharacters:
1311 pass
1312 elif data == ">":
1313 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1314 "expected-doctype-name-but-got-right-bracket"})
1315 self.currentToken["correct"] = False
1316 self.tokenQueue.append(self.currentToken)
1317 self.state = self.dataState
1318 elif data == "\u0000":
1319 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1320 "data": "invalid-codepoint"})
1321 self.currentToken["name"] = "\uFFFD"
1322 self.state = self.doctypeNameState
1323 elif data is EOF:
1324 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1325 "expected-doctype-name-but-got-eof"})
1326 self.currentToken["correct"] = False
1327 self.tokenQueue.append(self.currentToken)
1328 self.state = self.dataState
1329 else:
1330 self.currentToken["name"] = data
1331 self.state = self.doctypeNameState
1332 return True
1333
1334 def doctypeNameState(self):
1335 data = self.stream.char()
1336 if data in spaceCharacters:
1337 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1338 self.state = self.afterDoctypeNameState
1339 elif data == ">":
1340 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1341 self.tokenQueue.append(self.currentToken)
1342 self.state = self.dataState
1343 elif data == "\u0000":
1344 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1345 "data": "invalid-codepoint"})
1346 self.currentToken["name"] += "\uFFFD"
1347 self.state = self.doctypeNameState
1348 elif data is EOF:
1349 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1350 "eof-in-doctype-name"})
1351 self.currentToken["correct"] = False
1352 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1353 self.tokenQueue.append(self.currentToken)
1354 self.state = self.dataState
1355 else:
1356 self.currentToken["name"] += data
1357 return True
1358
1359 def afterDoctypeNameState(self):
1360 data = self.stream.char()
1361 if data in spaceCharacters:
1362 pass
1363 elif data == ">":
1364 self.tokenQueue.append(self.currentToken)
1365 self.state = self.dataState
1366 elif data is EOF:
1367 self.currentToken["correct"] = False
1368 self.stream.unget(data)
1369 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1370 "eof-in-doctype"})
1371 self.tokenQueue.append(self.currentToken)
1372 self.state = self.dataState
1373 else:
1374 if data in ("p", "P"):
1375 matched = True
1376 for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1377 ("i", "I"), ("c", "C")):
1378 data = self.stream.char()
1379 if data not in expected:
1380 matched = False
1381 break
1382 if matched:
1383 self.state = self.afterDoctypePublicKeywordState
1384 return True
1385 elif data in ("s", "S"):
1386 matched = True
1387 for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1388 ("e", "E"), ("m", "M")):
1389 data = self.stream.char()
1390 if data not in expected:
1391 matched = False
1392 break
1393 if matched:
1394 self.state = self.afterDoctypeSystemKeywordState
1395 return True
1396
1397 # All the characters read before the current 'data' will be
1398 # [a-zA-Z], so they're garbage in the bogus doctype and can be
1399 # discarded; only the latest character might be '>' or EOF
1400 # and needs to be put back with unget().
1401 self.stream.unget(data)
1402 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1403 "expected-space-or-right-bracket-in-doctype", "datavars":
1404 {"data": data}})
1405 self.currentToken["correct"] = False
1406 self.state = self.bogusDoctypeState
1407
1408 return True
1409
1410 def afterDoctypePublicKeywordState(self):
1411 data = self.stream.char()
1412 if data in spaceCharacters:
1413 self.state = self.beforeDoctypePublicIdentifierState
1414 elif data in ("'", '"'):
1415 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1416 "unexpected-char-in-doctype"})
1417 self.stream.unget(data)
1418 self.state = self.beforeDoctypePublicIdentifierState
1419 elif data is EOF:
1420 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1421 "eof-in-doctype"})
1422 self.currentToken["correct"] = False
1423 self.tokenQueue.append(self.currentToken)
1424 self.state = self.dataState
1425 else:
1426 self.stream.unget(data)
1427 self.state = self.beforeDoctypePublicIdentifierState
1428 return True
1429
1430 def beforeDoctypePublicIdentifierState(self):
1431 data = self.stream.char()
1432 if data in spaceCharacters:
1433 pass
1434 elif data == "\"":
1435 self.currentToken["publicId"] = ""
1436 self.state = self.doctypePublicIdentifierDoubleQuotedState
1437 elif data == "'":
1438 self.currentToken["publicId"] = ""
1439 self.state = self.doctypePublicIdentifierSingleQuotedState
1440 elif data == ">":
1441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1442 "unexpected-end-of-doctype"})
1443 self.currentToken["correct"] = False
1444 self.tokenQueue.append(self.currentToken)
1445 self.state = self.dataState
1446 elif data is EOF:
1447 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1448 "eof-in-doctype"})
1449 self.currentToken["correct"] = False
1450 self.tokenQueue.append(self.currentToken)
1451 self.state = self.dataState
1452 else:
1453 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1454 "unexpected-char-in-doctype"})
1455 self.currentToken["correct"] = False
1456 self.state = self.bogusDoctypeState
1457 return True
1458
1459 def doctypePublicIdentifierDoubleQuotedState(self):
1460 data = self.stream.char()
1461 if data == "\"":
1462 self.state = self.afterDoctypePublicIdentifierState
1463 elif data == "\u0000":
1464 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1465 "data": "invalid-codepoint"})
1466 self.currentToken["publicId"] += "\uFFFD"
1467 elif data == ">":
1468 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1469 "unexpected-end-of-doctype"})
1470 self.currentToken["correct"] = False
1471 self.tokenQueue.append(self.currentToken)
1472 self.state = self.dataState
1473 elif data is EOF:
1474 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1475 "eof-in-doctype"})
1476 self.currentToken["correct"] = False
1477 self.tokenQueue.append(self.currentToken)
1478 self.state = self.dataState
1479 else:
1480 self.currentToken["publicId"] += data
1481 return True
1482
1483 def doctypePublicIdentifierSingleQuotedState(self):
1484 data = self.stream.char()
1485 if data == "'":
1486 self.state = self.afterDoctypePublicIdentifierState
1487 elif data == "\u0000":
1488 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1489 "data": "invalid-codepoint"})
1490 self.currentToken["publicId"] += "\uFFFD"
1491 elif data == ">":
1492 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1493 "unexpected-end-of-doctype"})
1494 self.currentToken["correct"] = False
1495 self.tokenQueue.append(self.currentToken)
1496 self.state = self.dataState
1497 elif data is EOF:
1498 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1499 "eof-in-doctype"})
1500 self.currentToken["correct"] = False
1501 self.tokenQueue.append(self.currentToken)
1502 self.state = self.dataState
1503 else:
1504 self.currentToken["publicId"] += data
1505 return True
1506
1507 def afterDoctypePublicIdentifierState(self):
1508 data = self.stream.char()
1509 if data in spaceCharacters:
1510 self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1511 elif data == ">":
1512 self.tokenQueue.append(self.currentToken)
1513 self.state = self.dataState
1514 elif data == '"':
1515 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1516 "unexpected-char-in-doctype"})
1517 self.currentToken["systemId"] = ""
1518 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1519 elif data == "'":
1520 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1521 "unexpected-char-in-doctype"})
1522 self.currentToken["systemId"] = ""
1523 self.state = self.doctypeSystemIdentifierSingleQuotedState
1524 elif data is EOF:
1525 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526 "eof-in-doctype"})
1527 self.currentToken["correct"] = False
1528 self.tokenQueue.append(self.currentToken)
1529 self.state = self.dataState
1530 else:
1531 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1532 "unexpected-char-in-doctype"})
1533 self.currentToken["correct"] = False
1534 self.state = self.bogusDoctypeState
1535 return True
1536
1537 def betweenDoctypePublicAndSystemIdentifiersState(self):
1538 data = self.stream.char()
1539 if data in spaceCharacters:
1540 pass
1541 elif data == ">":
1542 self.tokenQueue.append(self.currentToken)
1543 self.state = self.dataState
1544 elif data == '"':
1545 self.currentToken["systemId"] = ""
1546 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1547 elif data == "'":
1548 self.currentToken["systemId"] = ""
1549 self.state = self.doctypeSystemIdentifierSingleQuotedState
1550 elif data == EOF:
1551 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1552 "eof-in-doctype"})
1553 self.currentToken["correct"] = False
1554 self.tokenQueue.append(self.currentToken)
1555 self.state = self.dataState
1556 else:
1557 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1558 "unexpected-char-in-doctype"})
1559 self.currentToken["correct"] = False
1560 self.state = self.bogusDoctypeState
1561 return True
1562
1563 def afterDoctypeSystemKeywordState(self):
1564 data = self.stream.char()
1565 if data in spaceCharacters:
1566 self.state = self.beforeDoctypeSystemIdentifierState
1567 elif data in ("'", '"'):
1568 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1569 "unexpected-char-in-doctype"})
1570 self.stream.unget(data)
1571 self.state = self.beforeDoctypeSystemIdentifierState
1572 elif data is EOF:
1573 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1574 "eof-in-doctype"})
1575 self.currentToken["correct"] = False
1576 self.tokenQueue.append(self.currentToken)
1577 self.state = self.dataState
1578 else:
1579 self.stream.unget(data)
1580 self.state = self.beforeDoctypeSystemIdentifierState
1581 return True
1582
1583 def beforeDoctypeSystemIdentifierState(self):
1584 data = self.stream.char()
1585 if data in spaceCharacters:
1586 pass
1587 elif data == "\"":
1588 self.currentToken["systemId"] = ""
1589 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1590 elif data == "'":
1591 self.currentToken["systemId"] = ""
1592 self.state = self.doctypeSystemIdentifierSingleQuotedState
1593 elif data == ">":
1594 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1595 "unexpected-char-in-doctype"})
1596 self.currentToken["correct"] = False
1597 self.tokenQueue.append(self.currentToken)
1598 self.state = self.dataState
1599 elif data is EOF:
1600 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1601 "eof-in-doctype"})
1602 self.currentToken["correct"] = False
1603 self.tokenQueue.append(self.currentToken)
1604 self.state = self.dataState
1605 else:
1606 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1607 "unexpected-char-in-doctype"})
1608 self.currentToken["correct"] = False
1609 self.state = self.bogusDoctypeState
1610 return True
1611
1612 def doctypeSystemIdentifierDoubleQuotedState(self):
1613 data = self.stream.char()
1614 if data == "\"":
1615 self.state = self.afterDoctypeSystemIdentifierState
1616 elif data == "\u0000":
1617 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1618 "data": "invalid-codepoint"})
1619 self.currentToken["systemId"] += "\uFFFD"
1620 elif data == ">":
1621 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1622 "unexpected-end-of-doctype"})
1623 self.currentToken["correct"] = False
1624 self.tokenQueue.append(self.currentToken)
1625 self.state = self.dataState
1626 elif data is EOF:
1627 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1628 "eof-in-doctype"})
1629 self.currentToken["correct"] = False
1630 self.tokenQueue.append(self.currentToken)
1631 self.state = self.dataState
1632 else:
1633 self.currentToken["systemId"] += data
1634 return True
1635
1636 def doctypeSystemIdentifierSingleQuotedState(self):
1637 data = self.stream.char()
1638 if data == "'":
1639 self.state = self.afterDoctypeSystemIdentifierState
1640 elif data == "\u0000":
1641 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1642 "data": "invalid-codepoint"})
1643 self.currentToken["systemId"] += "\uFFFD"
1644 elif data == ">":
1645 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1646 "unexpected-end-of-doctype"})
1647 self.currentToken["correct"] = False
1648 self.tokenQueue.append(self.currentToken)
1649 self.state = self.dataState
1650 elif data is EOF:
1651 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1652 "eof-in-doctype"})
1653 self.currentToken["correct"] = False
1654 self.tokenQueue.append(self.currentToken)
1655 self.state = self.dataState
1656 else:
1657 self.currentToken["systemId"] += data
1658 return True
1659
1660 def afterDoctypeSystemIdentifierState(self):
1661 data = self.stream.char()
1662 if data in spaceCharacters:
1663 pass
1664 elif data == ">":
1665 self.tokenQueue.append(self.currentToken)
1666 self.state = self.dataState
1667 elif data is EOF:
1668 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1669 "eof-in-doctype"})
1670 self.currentToken["correct"] = False
1671 self.tokenQueue.append(self.currentToken)
1672 self.state = self.dataState
1673 else:
1674 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1675 "unexpected-char-in-doctype"})
1676 self.state = self.bogusDoctypeState
1677 return True
1678
1679 def bogusDoctypeState(self):
1680 data = self.stream.char()
1681 if data == ">":
1682 self.tokenQueue.append(self.currentToken)
1683 self.state = self.dataState
1684 elif data is EOF:
1685 # XXX EMIT
1686 self.stream.unget(data)
1687 self.tokenQueue.append(self.currentToken)
1688 self.state = self.dataState
1689 else:
1690 pass
1691 return True
1692
1693 def cdataSectionState(self):
1694 data = []
1695 while True:
1696 data.append(self.stream.charsUntil("]"))
1697 data.append(self.stream.charsUntil(">"))
1698 char = self.stream.char()
1699 if char == EOF:
1700 break
1701 else:
1702 assert char == ">"
1703 if data[-1][-2:] == "]]":
1704 data[-1] = data[-1][:-2]
1705 break
1706 else:
1707 data.append(char)
1708
1709 data = "".join(data) # pylint:disable=redefined-variable-type
1710 # Deal with null here rather than in the parser
1711 nullCount = data.count("\u0000")
1712 if nullCount > 0:
1713 for _ in range(nullCount):
1714 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1715 "data": "invalid-codepoint"})
1716 data = data.replace("\u0000", "\uFFFD")
1717 if data:
1718 self.tokenQueue.append({"type": tokenTypes["Characters"],
1719 "data": data})
1720 self.state = self.dataState
1721 return True
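# Illustrative sketch (standalone, not part of the original module): a CDATA
# section passes its payload through as character data, so markup characters
# inside it are not re-tokenized.
def _example_cdata():
    text = "<![CDATA[a < b && c]]>"
    assert text[len("<![CDATA["):-len("]]>")] == "a < b && c"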