diff options
author | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
---|---|---|
committer | Shubham Saini <shubham6405@gmail.com> | 2018-12-11 10:01:23 +0000 |
commit | 68df54d6629ec019142eb149dd037774f2d11e7c (patch) | |
tree | 345bc22d46b4e01a4ba8303b94278952a4ed2b9e /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py |
First commit
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py new file mode 100644 index 0000000..7c37520 --- /dev/null +++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/latin1prober.py | |||
@@ -0,0 +1,145 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Universal charset detector code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # Shy Shalom - original C code | ||
12 | # | ||
13 | # This library is free software; you can redistribute it and/or | ||
14 | # modify it under the terms of the GNU Lesser General Public | ||
15 | # License as published by the Free Software Foundation; either | ||
16 | # version 2.1 of the License, or (at your option) any later version. | ||
17 | # | ||
18 | # This library is distributed in the hope that it will be useful, | ||
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | # Lesser General Public License for more details. | ||
22 | # | ||
23 | # You should have received a copy of the GNU Lesser General Public | ||
24 | # License along with this library; if not, write to the Free Software | ||
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
26 | # 02110-1301 USA | ||
27 | ######################### END LICENSE BLOCK ######################### | ||
28 | |||
29 | from .charsetprober import CharSetProber | ||
30 | from .enums import ProbingState | ||
31 | |||
# Number of likelihood buckets tracked per prober; the class-pair model
# below uses the values 0..3, one counter slot for each.
FREQ_CAT_NUM = 4

# Character classes used to index the lookup tables in this module.
# The ids are consecutive integers starting at 0, in the order listed.
CLASS_NUM = 8  # total classes
(
    UDF,  # undefined
    OTH,  # other
    ASC,  # ascii capital letter
    ASS,  # ascii small letter
    ACV,  # accent capital vowel
    ACO,  # accent capital other
    ASV,  # accent small vowel
    ASO,  # accent small other
) = range(CLASS_NUM)
43 | |||
def _build_latin1_char_to_class():
    """Return the 256-entry tuple mapping a byte value to its class id.

    Every byte defaults to OTH; the inclusive (first, last) code-point
    ranges listed below override that default for the other classes.
    """
    inclusive_ranges = (
        # undefined
        (UDF, ((0x81, 0x81), (0x8D, 0x8D), (0x8F, 0x90), (0x9D, 0x9D))),
        # ascii capital letters
        (ASC, ((0x41, 0x5A),)),
        # ascii small letters
        (ASS, ((0x61, 0x7A),)),
        # accent capital vowels
        (ACV, ((0xC0, 0xC5), (0xC8, 0xCF), (0xD2, 0xD6), (0xD8, 0xDC))),
        # accent capital others
        (ACO, ((0x8A, 0x8A), (0x8C, 0x8C), (0x8E, 0x8E), (0x9F, 0x9F),
               (0xC6, 0xC7), (0xD0, 0xD1), (0xDD, 0xDF))),
        # accent small vowels
        (ASV, ((0xE0, 0xE5), (0xE8, 0xEF), (0xF2, 0xF6), (0xF8, 0xFC))),
        # accent small others
        (ASO, ((0x83, 0x83), (0x9A, 0x9A), (0x9C, 0x9C), (0x9E, 0x9E),
               (0xE6, 0xE7), (0xF0, 0xF1), (0xFD, 0xFF))),
    )

    table = [OTH] * 256
    for char_class, ranges in inclusive_ranges:
        for first, last in ranges:
            for code in range(first, last + 1):
                table[code] = char_class
    return tuple(table)


# Indexed by byte value (0x00 - 0xFF); each entry is one of the class ids.
Latin1_CharToClass = _build_latin1_char_to_class()
78 | |||
# Likelihood of one character class following another:
#   0 : illegal
#   1 : very unlikely
#   2 : normal
#   3 : very likely
#
# One row per preceding class, one column per following class, both in
# class-id order: UDF OTH ASC ASS ACV ACO ASV ASO.
_LATIN1_CLASS_MODEL_ROWS = (
    (0, 0, 0, 0, 0, 0, 0, 0),  # UDF
    (0, 3, 3, 3, 3, 3, 3, 3),  # OTH
    (0, 3, 3, 3, 3, 3, 3, 3),  # ASC
    (0, 3, 3, 3, 1, 1, 3, 3),  # ASS
    (0, 3, 3, 3, 1, 2, 1, 2),  # ACV
    (0, 3, 3, 3, 3, 3, 3, 3),  # ACO
    (0, 3, 1, 3, 1, 1, 1, 3),  # ASV
    (0, 3, 1, 3, 1, 1, 3, 3),  # ASO
)

# Flattened row-major so it can be indexed as
# (previous_class * number_of_classes) + current_class.
Latin1ClassModel = tuple(
    likelihood
    for model_row in _LATIN1_CLASS_MODEL_ROWS
    for likelihood in model_row
)
94 | |||
95 | |||
class Latin1Prober(CharSetProber):
    """Prober that scores input as ISO-8859-1 from adjacent byte classes.

    Each fed byte is mapped to a character class through
    ``Latin1_CharToClass``; the (previous class, current class) pair is
    then looked up in ``Latin1ClassModel`` and the resulting likelihood
    bucket is tallied in ``_freq_counter``.
    """

    def __init__(self):
        super(Latin1Prober, self).__init__()
        # Placeholders only; reset() installs the real starting values.
        self._last_char_class = None
        self._freq_counter = None
        self.reset()

    def reset(self):
        """Return the prober to its initial state."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    @property
    def charset_name(self):
        """Name of the charset this prober reports."""
        return "ISO-8859-1"

    @property
    def language(self):
        """Language reported by this prober (always the empty string)."""
        return ""

    def feed(self, byte_str):
        """Consume ``byte_str`` and return the prober's current state.

        Stops at the first class pair the model marks as illegal (0) and
        switches the state to ``ProbingState.NOT_ME``.
        """
        # NOTE(review): filter_with_english_letters comes from the base
        # class; presumably it yields integer byte values -- confirm.
        for byte in self.filter_with_english_letters(byte_str):
            current_class = Latin1_CharToClass[byte]
            model_index = self._last_char_class * CLASS_NUM + current_class
            likelihood = Latin1ClassModel[model_index]
            if not likelihood:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self):
        """Return a confidence score derived from the likelihood tallies."""
        if self.state == ProbingState.NOT_ME:
            return 0.01

        sample_count = sum(self._freq_counter)
        if sample_count < 0.01:
            # Nothing tallied yet, so there is no evidence either way.
            return 0.0

        very_likely = self._freq_counter[3]
        very_unlikely = self._freq_counter[1]
        # "Very unlikely" pairs are penalised 20x against "very likely"
        # ones; a negative score is clamped to zero.
        score = (very_likely - very_unlikely * 20.0) / sample_count
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        return max(score, 0.0) * 0.73