diff options
author | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
---|---|---|
committer | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
commit | 68df54d6629ec019142eb149dd037774f2d11e7c (patch) | |
tree | 345bc22d46b4e01a4ba8303b94278952a4ed2b9e /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py |
First commit
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py new file mode 100644 index 0000000..e5509a0 --- /dev/null +++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/chardistribution.py | |||
@@ -0,0 +1,233 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Communicator client code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 1998 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # | ||
12 | # This library is free software; you can redistribute it and/or | ||
13 | # modify it under the terms of the GNU Lesser General Public | ||
14 | # License as published by the Free Software Foundation; either | ||
15 | # version 2.1 of the License, or (at your option) any later version. | ||
16 | # | ||
17 | # This library is distributed in the hope that it will be useful, | ||
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
20 | # Lesser General Public License for more details. | ||
21 | # | ||
22 | # You should have received a copy of the GNU Lesser General Public | ||
23 | # License along with this library; if not, write to the Free Software | ||
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
25 | # 02110-1301 USA | ||
26 | ######################### END LICENSE BLOCK ######################### | ||
27 | |||
28 | from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE, | ||
29 | EUCTW_TYPICAL_DISTRIBUTION_RATIO) | ||
30 | from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE, | ||
31 | EUCKR_TYPICAL_DISTRIBUTION_RATIO) | ||
32 | from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE, | ||
33 | GB2312_TYPICAL_DISTRIBUTION_RATIO) | ||
34 | from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE, | ||
35 | BIG5_TYPICAL_DISTRIBUTION_RATIO) | ||
36 | from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE, | ||
37 | JIS_TYPICAL_DISTRIBUTION_RATIO) | ||
38 | |||
39 | |||
class CharDistributionAnalysis(object):
    """Base class for character-distribution based confidence scoring.

    Subclasses supply a frequency-order table, its size, a typical
    distribution ratio, and a ``get_order`` implementation that maps a
    two-byte character to an index into that table.
    """

    ENOUGH_DATA_THRESHOLD = 1024
    SURE_YES = 0.99
    SURE_NO = 0.01
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self):
        # Table translating a character order (as produced by get_order())
        # into a frequency order.
        self._char_to_freq_order = None
        # Number of entries in the table above.
        self._table_size = None
        # Language-dependent constant used when computing the confidence.
        # See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = None
        self._done = self._total_chars = self._freq_chars = None
        self.reset()

    def reset(self):
        """Clear all accumulated state so the analyser can be reused."""
        # True once detection is finished and a conclusion has been reached.
        self._done = False
        # Count of characters with a valid (non-negative) order.
        self._total_chars = 0
        # Count of characters whose frequency order is below 512.
        self._freq_chars = 0

    def feed(self, char, char_len):
        """Account for one character of known byte length.

        Only two-byte characters take part in the distribution analysis;
        anything else is ignored.
        """
        if char_len != 2:
            return
        order = self.get_order(char)
        if order < 0:
            # Not a character this analyser is interested in.
            return
        self._total_chars += 1
        if (order < self._table_size
                and self._char_to_freq_order[order] < 512):
            self._freq_chars += 1

    def get_confidence(self):
        """Return a confidence value derived from the data seen so far."""
        total = self._total_chars
        frequent = self._freq_chars
        # Nothing in our range of interest, or too few frequent characters:
        # answer negatively.
        if total <= 0 or frequent <= self.MINIMUM_DATA_THRESHOLD:
            return self.SURE_NO

        infrequent = total - frequent
        if infrequent != 0:
            ratio = frequent / (infrequent * self.typical_distribution_ratio)
            if ratio < self.SURE_YES:
                return ratio

        # Cap the confidence; we never want to claim 100% certainty.
        return self.SURE_YES

    def got_enough_data(self):
        """Tell whether enough characters were seen to draw a conclusion."""
        # The whole input is not needed for charset detection; a certain
        # amount of data is sufficient.
        return self._total_chars > self.ENOUGH_DATA_THRESHOLD

    def get_order(self, byte_str):
        """Map an encoded character to its order; the base class knows none.

        Characters are not handled through their raw encoded bytes but
        through a number called the order, which lets several encodings of
        one language share a single frequency table.
        """
        return -1
111 | |||
112 | |||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the EUC-TW frequency table."""

    def __init__(self):
        super(EUCTWDistributionAnalysis, self).__init__()
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCTW_TABLE_SIZE
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order of a two-byte EUC-TW character, or -1.

        Bytes of interest:
          first  byte range: 0xc4 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        No validation is performed here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xC4:
            return -1
        # 94 slots per lead byte, offset by the trail byte.
        return 94 * (lead - 0xC4) + byte_str[1] - 0xA1
130 | |||
131 | |||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the EUC-KR frequency table."""

    def __init__(self):
        super(EUCKRDistributionAnalysis, self).__init__()
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCKR_TABLE_SIZE
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order of a two-byte EUC-KR character, or -1.

        Bytes of interest:
          first  byte range: 0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        No validation is performed here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xB0:
            return -1
        # 94 slots per lead byte, offset by the trail byte.
        return 94 * (lead - 0xB0) + byte_str[1] - 0xA1
149 | |||
150 | |||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the GB2312 frequency table."""

    def __init__(self):
        super(GB2312DistributionAnalysis, self).__init__()
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = GB2312_TABLE_SIZE
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order of a two-byte GB2312 character, or -1.

        Bytes of interest:
          first  byte range: 0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        No validation is performed here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xB0 or trail < 0xA1:
            return -1
        # 94 slots per lead byte, offset by the trail byte.
        return 94 * (lead - 0xB0) + trail - 0xA1
168 | |||
169 | |||
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the Big5 frequency table."""

    def __init__(self):
        super(Big5DistributionAnalysis, self).__init__()
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = BIG5_TABLE_SIZE
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order of a two-byte Big5 character, or -1.

        Bytes of interest:
          first  byte range: 0xa4 -- 0xfe
          second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        No validation is performed here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xA4:
            return -1
        # 157 slots per lead byte: 63 for the low trail range, the rest for
        # the high trail range.
        row_start = 157 * (lead - 0xA4)
        if trail >= 0xA1:
            return row_start + trail - 0xA1 + 63
        return row_start + trail - 0x40
190 | |||
191 | |||
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for Shift_JIS, using the JIS frequency table."""

    def __init__(self):
        super(SJISDistributionAnalysis, self).__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the order of a two-byte Shift_JIS character, or -1.

        Bytes of interest:
          first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xef
          second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
        No validation is performed here; the state machine has done that.

        :param byte_str: indexable sequence of at least two integer byte
            values
        :returns: index into the frequency table, or -1 when the lead byte
            is outside the ranges above
        """
        first_char, second_char = byte_str[0], byte_str[1]
        if 0x81 <= first_char <= 0x9F:
            order = 188 * (first_char - 0x81)
        elif 0xE0 <= first_char <= 0xEF:
            # The second lead-byte block continues after the 31 lead bytes
            # of the first block (0x81 -- 0x9f).
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        if second_char > 0x7F:
            # 0x7F is not a trail byte (see the ranges above), so trail
            # bytes past it move down one slot. This is a decrement, as in
            # the reference Mozilla implementation (``order--``); the
            # character must not be rejected.
            order -= 1
        return order
215 | |||
216 | |||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for EUC-JP, using the JIS frequency table."""

    def __init__(self):
        super(EUCJPDistributionAnalysis, self).__init__()
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = JIS_TABLE_SIZE
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order of a two-byte EUC-JP character, or -1.

        Bytes of interest:
          first  byte range: 0xa0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        No validation is performed here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xA0:
            return -1
        # The offset is 0xA1 although 0xA0 passes the guard: for a lead byte
        # of 0xA0 the result is negative whenever the trail byte is at most
        # 0xFE, which feed() treats as "not of interest".
        return 94 * (lead - 0xA1) + byte_str[1] - 0xA1