author    Shubham Saini <shubham6405@gmail.com>  2018-12-11 10:01:23 +0000
committer Shubham Saini <shubham6405@gmail.com>  2018-12-11 10:01:23 +0000
commit    68df54d6629ec019142eb149dd037774f2d11e7c (patch)
tree      345bc22d46b4e01a4ba8303b94278952a4ed2b9e /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
First commit
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py')
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py  923
1 file changed, 923 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
new file mode 100644
index 0000000..21c6bbc
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
@@ -0,0 +1,923 @@
from __future__ import absolute_import, division, unicode_literals

from pip._vendor.six import text_type, binary_type
from pip._vendor.six.moves import http_client, urllib

import codecs
import re

from pip._vendor import webencodings

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils

from io import StringIO

try:
    from io import BytesIO
except ImportError:
    BytesIO = StringIO

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
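
# Editorial note (not part of upstream html5lib): BufferedStream exists to
# give seek()/tell() to unseekable byte sources such as HTTP responses, so
# the encoding pre-scan further below can peek at the head of the document
# and rewind. A minimal sketch of the behaviour it guarantees:
#
#     wrapped = BufferedStream(unseekable)   # hypothetical unseekable stream
#     head = wrapped.read(4)                 # chunks are kept in self.buffer
#     wrapped.seek(0)                        # rewinding replays the buffer
#     assert wrapped.read(4) == head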


def HTMLInputStream(source, **kwargs):
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source, http_client.HTTPResponse) or
            # Also check for addinfourl wrapping HTTPResponse
            (isinstance(source, urllib.response.addbase) and
             isinstance(source.fp, http_client.HTTPResponse))):
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        encodings = [x for x in kwargs if x.endswith("_encoding")]
        if encodings:
            raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

        return HTMLUnicodeInputStream(source, **kwargs)
    else:
        return HTMLBinaryInputStream(source, **kwargs)
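
# Usage sketch (editorial, not part of upstream html5lib): the factory picks
# the stream class from the source type, so callers never choose one
# directly,
#
#     HTMLInputStream("<p>text</p>")    # unicode in -> HTMLUnicodeInputStream
#     HTMLInputStream(b"<p>text</p>")   # bytes in -> HTMLBinaryInputStream
#
# and passing any *_encoding keyword with unicode input raises TypeError,
# since an already-decoded string has no encoding left to negotiate.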


class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """

        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)
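
    # Worked example (editorial comment, not in upstream html5lib): with no
    # previous chunks and the current chunk "ab\ncd", _position(5) counts one
    # "\n" before offset 5 and measures the column from the character after
    # it, giving (0 + 1, 5 - (2 + 1)) == (1, 2); position() then reports the
    # 1-based result (2, 2).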

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True
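
    # Editorial note (not in upstream): the one-character look-back above is
    # what keeps "\r\n" pairs and UTF-16 surrogate pairs intact across chunk
    # boundaries. If a chunk ends in "\r" (0x0D) or a lead surrogate
    # (0xD800-0xDBFF), that last character is held in _bufferedCharacter and
    # prepended to the next chunk before the newline normalisation runs, so
    # a "\r\n" split over two reads still collapses to a single "\n".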

    def characterErrorsUCS4(self, data):
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
                # If the whole remainder of the chunk matched,
                # use it all and read the next chunk
                rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r
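
    # Example (editorial, not in upstream): with the current chunk
    # "hello world", charsUntil(spaceCharacters) returns "hello" and leaves
    # chunkOffset on the space; charsUntil(spaceCharacters, True) would
    # instead consume a run of space characters. The compiled pattern - a
    # negated character class such as "[^\x09\x0a\x0c\x0d\x20]+" in the
    # non-opposite case - is cached in charsUntilRegEx keyed on
    # (characters, opposite), which is why 'characters' must be hashable.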

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char


class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when detecting encoding using chardet
        self.numBytesChardet = 100
        # Things from args
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Determine encoding
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except:  # pylint:disable=bare-except
            stream = BufferedStream(stream)

        return stream

    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from pip._vendor.chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"
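
    # Editorial summary (not in upstream): the ladder above follows the HTML5
    # encoding sniffing order - BOM, explicit override, transport layer,
    # <meta> pre-scan, same-origin parent, "likely" hint, chardet guess, then
    # the default - and pairs each codec with a confidence of "certain" or
    # "tentative". For example, constructing the stream with
    # transport_encoding="utf-8" and no BOM returns the utf-8 codec marked
    # "certain" without ever running the pre-scan.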

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
            return lookupEncoding(encoding)
        else:
            self.rawStream.seek(0)
            return None
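
    # Worked example (editorial, not in upstream): for a stream beginning
    # b"\xef\xbb\xbfhello", string[:3] matches codecs.BOM_UTF8, so the stream
    # is repositioned past the 3 BOM bytes and the utf-8 codec is returned;
    # with no BOM the stream rewinds to offset 0 and None is returned. UTF-32
    # must be checked before UTF-16 because BOM_UTF32_LE starts with the same
    # two bytes as BOM_UTF16_LE.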

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")

        return encoding
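
    # Example (editorial, not in upstream): if the first numBytesMeta bytes
    # contain b'<meta http-equiv="Content-Type" content="text/html;
    # charset=utf-8">', the pre-scan reports the utf-8 codec, used with
    # "tentative" confidence. A declared utf-16 variant is coerced to utf-8
    # above, since a byte stream whose ASCII is readable enough to expose the
    # <meta> cannot really be utf-16.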


class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            raise StopIteration
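
# Usage sketch (editorial, not part of upstream html5lib): EncodingBytes is a
# bytes subclass with a cursor, lower-cased once at construction so that all
# later matching is effectively case-insensitive.
#
#     data = EncodingBytes(b"<META Charset=UTF-8>")
#     next(data)                  # -> b"<"; the cursor starts before index 0
#     data.matchBytes(b"<meta")   # -> True; cursor advances past the match
#     data.currentByte            # -> b" "
#
# skip()/skipUntil() return the byte they stop on, while jumpTo() raises
# StopIteration when the needle is absent, which getEncoding() below treats
# as "stop parsing".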


class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding
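
    # Example (editorial, not in upstream): both declaration styles are picked
    # up by handleMeta() below,
    #
    #     EncodingParser(b'<meta charset="utf-8">').getEncoding()
    #     EncodingParser(b'<meta http-equiv=content-type'
    #                    b' content="text/html; charset=utf-8">').getEncoding()
    #
    # each returning the utf-8 codec, while comments, end tags and other "<"
    # constructs are skipped by the remaining handlers.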

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # <meta is not followed by a space, so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
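
    # Worked example (editorial, not in upstream): positioned on the space in
    # b'<meta charset="utf-8">', getAttribute() skips the leading whitespace,
    # collects the name up to the "=", then the quoted value, and returns
    # (b"charset", b"utf-8"); a bare b">" (or end of input) yields None so
    # the caller stops scanning the current tag.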


class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None
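
    # Example (editorial, not in upstream):
    #
    #     ContentAttrParser(EncodingBytes(b"text/html; charset=utf-8")).parse()
    #
    # returns b"utf-8". Here the unquoted value runs to end-of-input, so the
    # position property raises StopIteration, which the inner handler catches
    # to return the remaining bytes; if b"charset" never appears at all,
    # jumpTo() raises and the outer handler returns None.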


def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, binary_type):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is not None:
        try:
            return webencodings.lookup(encoding)
        except AttributeError:
            return None
    else:
        return None
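
# Usage sketch (editorial, not part of upstream html5lib): lookupEncoding
# resolves WHATWG encoding labels through webencodings and accepts bytes or
# text,
#
#     lookupEncoding(b"UTF8").name      # -> "utf-8" (label normalisation)
#     lookupEncoding("latin1").name     # -> "windows-1252", per the spec
#     lookupEncoding("no-such-label")   # -> None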