diff options
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/universaldetector.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/universaldetector.py | 286 |
1 file changed, 0 insertions, 286 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/universaldetector.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/universaldetector.py deleted file mode 100644 index 8a6de3b..0000000 --- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/universaldetector.py +++ /dev/null | |||
@@ -1,286 +0,0 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Universal charset detector code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # Shy Shalom - original C code | ||
12 | # | ||
13 | # This library is free software; you can redistribute it and/or | ||
14 | # modify it under the terms of the GNU Lesser General Public | ||
15 | # License as published by the Free Software Foundation; either | ||
16 | # version 2.1 of the License, or (at your option) any later version. | ||
17 | # | ||
18 | # This library is distributed in the hope that it will be useful, | ||
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | # Lesser General Public License for more details. | ||
22 | # | ||
23 | # You should have received a copy of the GNU Lesser General Public | ||
24 | # License along with this library; if not, write to the Free Software | ||
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
26 | # 02110-1301 USA | ||
27 | ######################### END LICENSE BLOCK ######################### | ||
28 | """ | ||
29 | Module containing the UniversalDetector detector class, which is the primary | ||
30 | class a user of ``chardet`` should use. | ||
31 | |||
32 | :author: Mark Pilgrim (initial port to Python) | ||
33 | :author: Shy Shalom (original C code) | ||
34 | :author: Dan Blanchard (major refactoring for 3.0) | ||
35 | :author: Ian Cordasco | ||
36 | """ | ||
37 | |||
38 | |||
39 | import codecs | ||
40 | import logging | ||
41 | import re | ||
42 | |||
43 | from .charsetgroupprober import CharSetGroupProber | ||
44 | from .enums import InputState, LanguageFilter, ProbingState | ||
45 | from .escprober import EscCharSetProber | ||
46 | from .latin1prober import Latin1Prober | ||
47 | from .mbcsgroupprober import MBCSGroupProber | ||
48 | from .sbcsgroupprober import SBCSGroupProber | ||
49 | |||
50 | |||
class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # Probers whose confidence falls below this are ignored by ``close``.
    MINIMUM_THRESHOLD = 0.20
    # Any byte >= 0x80 means the input is not pure ASCII.
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    # ESC (ISO-2022 family) or '~{' (HZ-GB-2312) introduce escape encodings.
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    # Bytes 0x80-0x9F are used by Windows codepages but are C1 controls in
    # the ISO-8859 family; seeing one suggests a Windows encoding.
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    # Maps an ISO-8859 result to its Windows superset codepage, used when
    # Windows-specific bytes were observed in the input.
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        """
        :param lang_filter: Restricts which language/charset probers are
                            consulted; defaults to all of them.
        """
        # Probers are created lazily in ``feed`` the first time they are
        # needed; ``reset`` establishes all remaining state.
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        # Kept so an ESC split across two feed() chunks is still detected.
        self._last_char = b''
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not len(byte_str):
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "UTF-8-SIG",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {'encoding': "UTF-32",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {'encoding': "UTF-16",
                               'confidence': 1.0,
                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
                self.done = True
                return

        # If none of those matched and we've only see ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif self._input_state == InputState.PURE_ASCII and \
                    self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug('no data received!')

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {'encoding': 'ascii',
                           'confidence': 1.0,
                           'language': ''}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                lower_charset_name = max_prober.charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                            charset_name)
                self.result = {'encoding': charset_name,
                               'confidence': confidence,
                               'language': max_prober.language}

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() == logging.DEBUG:
            if self.result['encoding'] is None:
                self.logger.debug('no probers hit minimum threshold')
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug('%s %s confidence = %s',
                                              prober.charset_name,
                                              prober.language,
                                              prober.get_confidence())
                    else:
                        # BUG FIX: the original logged the stale loop variable
                        # ``prober`` here instead of the prober actually being
                        # reported (``group_prober``), which logged the wrong
                        # prober's data or raised NameError.
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())
        return self.result