diff options
author | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
---|---|---|
committer | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
commit | 68df54d6629ec019142eb149dd037774f2d11e7c (patch) | |
tree | 345bc22d46b4e01a4ba8303b94278952a4ed2b9e /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py |
First commit
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py | 91 |
1 files changed, 91 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py new file mode 100644 index 0000000..4609154 --- /dev/null +++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/mbcharsetprober.py | |||
@@ -0,0 +1,91 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Universal charset detector code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # Shy Shalom - original C code | ||
12 | # Proofpoint, Inc. | ||
13 | # | ||
14 | # This library is free software; you can redistribute it and/or | ||
15 | # modify it under the terms of the GNU Lesser General Public | ||
16 | # License as published by the Free Software Foundation; either | ||
17 | # version 2.1 of the License, or (at your option) any later version. | ||
18 | # | ||
19 | # This library is distributed in the hope that it will be useful, | ||
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
22 | # Lesser General Public License for more details. | ||
23 | # | ||
24 | # You should have received a copy of the GNU Lesser General Public | ||
25 | # License along with this library; if not, write to the Free Software | ||
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
27 | # 02110-1301 USA | ||
28 | ######################### END LICENSE BLOCK ######################### | ||
29 | |||
30 | from .charsetprober import CharSetProber | ||
31 | from .enums import ProbingState, MachineState | ||
32 | |||
33 | |||
class MultiByteCharSetProber(CharSetProber):
    """
    Base prober for multi-byte character sets.

    The prober drives two collaborators, both of which start out as ``None``
    here and are presumably assigned by concrete subclasses (confirm against
    the subclasses):

    * ``coding_sm`` -- a state machine advanced one byte at a time.
    * ``distribution_analyzer`` -- receives byte pairs and supplies the
      confidence value.

    Subclasses must override ``charset_name`` and ``language``; the versions
    defined here raise ``NotImplementedError``.
    """

    def __init__(self, lang_filter=None):
        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        # Two-slot buffer used to bridge chunk boundaries: slot 0 keeps the
        # last byte of the previous chunk, slot 1 receives the first byte of
        # the current chunk (see feed()).
        self._last_char = [0, 0]

    def reset(self):
        """Reset the prober and whichever collaborators have been set."""
        super(MultiByteCharSetProber, self).reset()
        if self.coding_sm:
            self.coding_sm.reset()
        if self.distribution_analyzer:
            self.distribution_analyzer.reset()
        self._last_char = [0, 0]

    @property
    def charset_name(self):
        """Name of the probed charset; must be overridden by subclasses."""
        raise NotImplementedError

    @property
    def language(self):
        """Language of the probed charset; must be overridden by subclasses."""
        raise NotImplementedError

    def feed(self, byte_str):
        """
        Feed a chunk of bytes to the prober and return the resulting state.

        Each byte advances ``coding_sm``:

        * ``MachineState.ERROR`` marks the prober ``NOT_ME`` and stops.
        * ``MachineState.ITS_ME`` marks the prober ``FOUND_IT`` and stops.
        * ``MachineState.START`` hands the current byte and the one before it
          to ``distribution_analyzer`` together with the character length
          reported by the state machine.

        An empty ``byte_str`` is accepted and leaves the carried-over byte
        untouched (previously it raised ``IndexError``).

        :param byte_str: the bytes to examine; expected to yield integers
                         when indexed (e.g. ``bytearray``) -- TODO confirm
                         against callers.
        :returns: ``self.state`` after processing the chunk.
        """
        for i, byte in enumerate(byte_str):
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, i)
                self._state = ProbingState.NOT_ME
                break
            elif coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The preceding byte belongs to the previous chunk, so
                    # pair the byte saved from that chunk with this one.
                    self._last_char[1] = byte
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
                                                    char_len)

        # Remember the final byte for the next call. Guarded so that an empty
        # chunk does not raise IndexError and does not clobber the saved byte.
        if byte_str:
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Stop early once the analyzer has seen enough data and the
            # confidence exceeds the shortcut threshold.
            if (self.distribution_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence reported by ``distribution_analyzer``."""
        return self.distribution_analyzer.get_confidence()