diff options
author | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
---|---|---|
committer | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
commit | 68df54d6629ec019142eb149dd037774f2d11e7c (patch) | |
tree | 345bc22d46b4e01a4ba8303b94278952a4ed2b9e /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py |
First commit
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py new file mode 100644 index 0000000..66e0dfc --- /dev/null +++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/sbcharsetprober.py | |||
@@ -0,0 +1,132 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Universal charset detector code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # Shy Shalom - original C code | ||
12 | # | ||
13 | # This library is free software; you can redistribute it and/or | ||
14 | # modify it under the terms of the GNU Lesser General Public | ||
15 | # License as published by the Free Software Foundation; either | ||
16 | # version 2.1 of the License, or (at your option) any later version. | ||
17 | # | ||
18 | # This library is distributed in the hope that it will be useful, | ||
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | # Lesser General Public License for more details. | ||
22 | # | ||
23 | # You should have received a copy of the GNU Lesser General Public | ||
24 | # License along with this library; if not, write to the Free Software | ||
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
26 | # 02110-1301 USA | ||
27 | ######################### END LICENSE BLOCK ######################### | ||
28 | |||
29 | from .charsetprober import CharSetProber | ||
30 | from .enums import CharacterCategory, ProbingState, SequenceLikelihood | ||
31 | |||
32 | |||
class SingleByteCharSetProber(CharSetProber):
    """Prober for one single-byte charset, driven by a language model.

    ``model`` is a mapping that supplies ``char_to_order_map``,
    ``precedence_matrix``, ``typical_positive_ratio``,
    ``keep_english_letter``, ``charset_name`` and, optionally,
    ``language``.

    Every fed byte is translated to an "order" through
    ``char_to_order_map``.  Whenever two consecutive orders are both below
    ``SAMPLE_SIZE`` the pair is looked up in the flattened
    ``precedence_matrix`` and the resulting likelihood category is tallied;
    the confidence is computed from those tallies.
    """

    # Orders below this value take part in the pair (sequence) statistics.
    SAMPLE_SIZE = 64
    # Number of counted sequences that must be exceeded before feed() may
    # short-cut the detection state.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above which feed() switches the state to FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below which feed() switches the state to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: mapping describing the charset/language (see the
            class docstring for the keys that are read).
        :param reversed: when true, each pair of orders is swapped before
            it is looked up in the precedence matrix.
        :param name_prober: optional auxiliary prober; when given, the
            ``charset_name`` and ``language`` properties delegate to it.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # The counters are declared here and given their real initial
        # values by reset().
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear all collected statistics."""
        super(SingleByteCharSetProber, self).reset()
        # Order of the previously seen character; 255 is not below
        # SAMPLE_SIZE, so the first character never completes a pair.
        self._last_order = 255
        # One counter per sequence-likelihood category.
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the auxiliary prober, else from the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        return self._model['charset_name']

    @property
    def language(self):
        """Language from the auxiliary prober, else from the model.

        The model's ``language`` entry is optional; ``None`` is returned
        when it is missing.
        """
        if self._name_prober:
            return self._name_prober.language
        return self._model.get('language')

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the state.

        Unless the model sets ``keep_english_letter``, the input is first
        passed through ``filter_international_words``.  Once more than
        ``SB_ENOUGH_REL_THRESHOLD`` sequences have been counted while the
        prober is still detecting, the state is switched to ``FOUND_IT``
        or ``NOT_ME`` when the confidence crosses the respective shortcut
        threshold.

        :param byte_str: iterable of values usable as indexes into the
            model's ``char_to_order_map``.
        :returns: the prober state after processing the input.
        """
        model = self._model
        if not model['keep_english_letter']:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        # Loop invariants, looked up once instead of once per byte.
        char_to_order_map = model['char_to_order_map']
        precedence_matrix = model['precedence_matrix']
        sample_size = self.SAMPLE_SIZE

        for char in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63
            #      here, but that leads to 27 more test failures than
            #      before.
            order = char_to_order_map[char]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250,
            #      but CharacterCategory.SYMBOL is actually 253, so we use
            #      CONTROL to make it closer to the original intent. The
            #      only difference is whether or not we count digits and
            #      control characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                if self._last_order < sample_size:
                    self._total_seqs += 1
                    if self._reversed:
                        # reverse the order of the letters in the lookup
                        index = order * sample_size + self._last_order
                    else:
                        index = self._last_order * sample_size + order
                    likelihood = precedence_matrix[index]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        if (self.state == ProbingState.DETECTING and
                self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD):
            charset_name = model['charset_name']
            confidence = self.get_confidence()
            if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, we have a winner',
                                  charset_name, confidence)
                self._state = ProbingState.FOUND_IT
            elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, below negative '
                                  'shortcut threshhold %s', charset_name,
                                  confidence,
                                  self.NEGATIVE_SHORTCUT_THRESHOLD)
                self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the current confidence for this charset.

        With no sequences counted yet the result is ``0.01``.  Otherwise
        it is the share of ``POSITIVE`` sequences, divided by the model's
        ``typical_positive_ratio`` and scaled by
        ``_freq_char / _total_char``; a result of ``1.0`` or more is
        reported as ``0.99``.
        """
        if self._total_seqs <= 0:
            return 0.01

        # The 1.0 factor forces float division on Python 2 as well.
        positive = 1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]
        confidence = (positive / self._total_seqs /
                      self._model['typical_positive_ratio'])
        confidence = confidence * self._freq_char / self._total_char
        if confidence >= 1.0:
            confidence = 0.99
        return confidence