summaryrefslogtreecommitdiff
path: root/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
diff options
context:
space:
mode:
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py')
-rw-r--r--venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py923
1 files changed, 923 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
new file mode 100644
index 0000000..21c6bbc
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
@@ -0,0 +1,923 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from pip._vendor.six import text_type, binary_type
4from pip._vendor.six.moves import http_client, urllib
5
6import codecs
7import re
8
9from pip._vendor import webencodings
10
11from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
12from .constants import _ReparseException
13from . import _utils
14
15from io import StringIO
16
17try:
18 from io import BytesIO
19except ImportError:
20 BytesIO = StringIO
21
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
# Bytes that end a tag name during the encoding pre-scan: whitespace or angle brackets
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


# Character class of codepoints reported as parse errors in text content:
# C0 controls (minus tab/LF/FF/CR), DEL and C1 controls, U+FDD0..U+FDEF, and
# the two trailing noncharacters of every plane (U+xFFFE/U+xFFFF).
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

# Supplementary-plane noncharacters; used by the UCS-2 (narrow build) error
# path, where they can only be seen as decoded surrogate pairs.
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

# ASCII whitespace and punctuation ranges (tab..CR, space../, :..@, [..`, {..~)
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}
55
56
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        """Return the absolute read position within the buffered data."""
        # Sum the chunks fully consumed before the current one, then add the
        # offset into the current chunk.
        consumed = sum(len(chunk) for chunk in self.buffer[:self.position[0]])
        return consumed + self.position[1]

    def seek(self, pos):
        """Move the read position to absolute offset *pos*.

        Only positions inside already-buffered data are valid.
        """
        assert pos <= self._bufferedBytes()
        remaining = pos
        index = 0
        # Walk forward chunk by chunk until the offset fits in one chunk.
        while len(self.buffer[index]) < remaining:
            remaining -= len(self.buffer[index])
            index += 1
        self.position = [index, remaining]

    def read(self, bytes):
        """Read up to *bytes* bytes, serving from the buffer when possible."""
        if not self.buffer:
            return self._readStream(bytes)
        at_buffer_end = (self.position[0] == len(self.buffer) and
                         self.position[1] == len(self.buffer[-1]))
        if at_buffer_end:
            return self._readStream(bytes)
        return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        # Total number of bytes held across all buffered chunks.
        return sum(len(chunk) for chunk in self.buffer)

    def _readStream(self, bytes):
        # Pull fresh data from the wrapped stream and remember it.
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        # Serve as much as possible from buffered chunks, then fall back to
        # the underlying stream for any remainder.
        wanted = bytes
        pieces = []
        chunkIndex, chunkOffset = self.position
        while chunkIndex < len(self.buffer) and wanted != 0:
            assert wanted > 0
            chunk = self.buffer[chunkIndex]
            available = len(chunk) - chunkOffset
            if wanted <= available:
                take = wanted
                self.position = [chunkIndex, chunkOffset + take]
            else:
                take = available
                self.position = [chunkIndex, len(chunk)]
                chunkIndex += 1
            pieces.append(chunk[chunkOffset:chunkOffset + take])
            wanted -= take
            # Subsequent chunks are read from their start.
            chunkOffset = 0

        if wanted:
            pieces.append(self._readStream(wanted))

        return b"".join(pieces)
129
130
def HTMLInputStream(source, **kwargs):
    """Return the appropriate input-stream wrapper for *source*.

    Text (unicode) input gets an HTMLUnicodeInputStream; byte input gets an
    HTMLBinaryInputStream, which additionally performs encoding detection.
    """
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    looks_like_http_response = (
        isinstance(source, http_client.HTTPResponse) or
        # Also check for addinfourl wrapping HTTPResponse
        (isinstance(source, urllib.response.addbase) and
         isinstance(source.fp, http_client.HTTPResponse)))

    if looks_like_http_response:
        is_unicode = False
    elif hasattr(source, "read"):
        # Probe a file-like object by reading zero bytes/characters.
        is_unicode = isinstance(source.read(0), text_type)
    else:
        is_unicode = isinstance(source, text_type)

    if not is_unicode:
        return HTMLBinaryInputStream(source, **kwargs)

    # Encoding-related keyword arguments make no sense for text input.
    encodings = [x for x in kwargs if x.endswith("_encoding")]
    if encodings:
        raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
    return HTMLUnicodeInputStream(source, **kwargs)
152
153
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    # Number of characters requested from the underlying stream per readChunk()
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """

        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            # Wide (UCS-4) build: astral characters are single code units.
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            # Narrow (UCS-2) build: astral characters appear as surrogate pairs.
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        # Clear current-chunk state and parse-error accumulator.
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        # Map a chunk-relative character offset to (line, column), both
        # 0-based, by counting newlines seen so far.
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # No newline in this chunk before offset: continue previous line.
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        # Line is reported 1-based; column stays 0-based.
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
        EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Fetch and normalise the next chunk; return False at EOF."""
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Record line/column totals for the chunk we are about to discard so
        # _position() can keep absolute coordinates.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # Hold back a trailing CR or lead surrogate: its interpretation
            # depends on the first character of the next chunk.
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        # One parse error per invalid codepoint found in the chunk.
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        # NOTE(review): once `skip` is set True it is never reset to False on
        # the `continue` path, so later matches in the same chunk may be
        # silently skipped — verify against upstream html5lib intent.
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the end of the chunk.
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                # Negate the class: match everything *except* the characters.
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
382
383
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        # Things from args
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Determine encoding
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
        # Decode the raw byte stream with the chosen codec; undecodable
        # byte sequences are substituted ('replace') rather than raised.
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            # The encoding pre-scan needs to rewind; wrap non-seekable
            # streams in a BufferedStream that can.
            stream.seek(stream.tell())
        except:  # pylint:disable=bare-except
            stream = BufferedStream(stream)

        return stream

    def determineEncoding(self, chardet=True):
        # Each detection source is tried in decreasing order of authority;
        # the first hit wins, tagged "certain" or "tentative".
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overriden, we've been overriden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from pip._vendor.chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                # Rewind so the stream can be re-decoded from the start.
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        """Switch to *newEncoding* mid-parse (e.g. after a late meta tag).

        Raises _ReparseException when the document must be re-parsed from
        the beginning with the new encoding.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
        if newEncoding.name in ("utf-16be", "utf-16le"):
            # Per spec, a late utf-16 declaration is treated as utf-8.
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            # NOTE(review): self.charEncoding was already replaced above, so
            # the "from" encoding in this message shows the new encoding —
            # confirm whether the old value was intended here.
            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
            return lookupEncoding(encoding)
        else:
            self.rawStream.seek(0)
            return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        # Pre-scan only the first numBytesMeta bytes, then rewind.
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # Per spec, a declared utf-16 variant is read as utf-8.
        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")

        return encoding
582
583
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        assert isinstance(value, bytes)
        # Lower-case once so all subsequent matching is case-insensitive.
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument
        # Position starts one byte before the data; the first next() lands on 0.
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        # Advance by one byte and return it as a length-1 bytes object.
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        # Step back one byte; returns the byte at the new (decremented) position.
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            # Before the first next(): no position yet.
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                # Stop on the first byte NOT in chars and return it.
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        # Mirror image of skip(): stop on the first byte that IS in chars.
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            # Not found: signal the caller to abandon this scan.
            raise StopIteration
686
687
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        # Prefix dispatch table; first match wins. Handlers return False
        # (or raise StopIteration) to terminate the scan.
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        """Scan a <meta> tag's attributes for charset information."""
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        # A content attribute seen earlier becomes effective
                        # once the pragma is confirmed.
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                # Remember it in case the pragma comes later.
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        # Consume the byte after "</" before inspecting the tag name.
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        # Skip to the end of the markup declaration / PI.
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                # Attribute names are case-folded to lower case.
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
867
868
class ContentAttrParser(object):
    """Extract the charset value from a meta content attribute,
    e.g. b"text/html; charset=utf-8" -> b"utf-8"."""

    def __init__(self, data):
        # data is expected to be an EncodingBytes (a bytes subclass).
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        """Return the charset bytes, or None if none can be extracted."""
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # jumpTo/position access ran off the end: no usable charset.
            return None
906
907
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.

    Byte labels are decoded as ASCII first; labels that are None, fail to
    decode, or are unknown to webencodings all yield None.
    """
    if isinstance(encoding, binary_type):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is None:
        return None

    try:
        return webencodings.lookup(encoding)
    except AttributeError:
        # Non-string label (no .strip/.lower): treat as unknown.
        return None