diff options
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/charsetprober.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/charsetprober.py | 145 |
1 files changed, 0 insertions, 145 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/charsetprober.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/charsetprober.py deleted file mode 100644 index 1fc2746..0000000 --- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/chardet/charsetprober.py +++ /dev/null | |||
@@ -1,145 +0,0 @@ | |||
1 | ######################## BEGIN LICENSE BLOCK ######################## | ||
2 | # The Original Code is Mozilla Universal charset detector code. | ||
3 | # | ||
4 | # The Initial Developer of the Original Code is | ||
5 | # Netscape Communications Corporation. | ||
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | ||
7 | # the Initial Developer. All Rights Reserved. | ||
8 | # | ||
9 | # Contributor(s): | ||
10 | # Mark Pilgrim - port to Python | ||
11 | # Shy Shalom - original C code | ||
12 | # | ||
13 | # This library is free software; you can redistribute it and/or | ||
14 | # modify it under the terms of the GNU Lesser General Public | ||
15 | # License as published by the Free Software Foundation; either | ||
16 | # version 2.1 of the License, or (at your option) any later version. | ||
17 | # | ||
18 | # This library is distributed in the hope that it will be useful, | ||
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | # Lesser General Public License for more details. | ||
22 | # | ||
23 | # You should have received a copy of the GNU Lesser General Public | ||
24 | # License along with this library; if not, write to the Free Software | ||
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | ||
26 | # 02110-1301 USA | ||
27 | ######################### END LICENSE BLOCK ######################### | ||
28 | |||
29 | import logging | ||
30 | import re | ||
31 | |||
32 | from .enums import ProbingState | ||
33 | |||
34 | |||
class CharSetProber(object):
    """Abstract base class for all charset probers.

    Concrete probers consume byte buffers via ``feed`` and expose a
    detection ``state``, a ``charset_name`` and a confidence score.
    This base class also provides shared byte-filtering helpers used by
    several probers to strip out bytes that carry no charset signal.
    """

    # A prober whose confidence reaches this value may short-circuit
    # further probing.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """Create a prober, optionally restricted to ``lang_filter`` languages."""
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Return the prober to its initial, still-detecting state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        # Subclasses report the charset they detect; the base has none.
        return None

    def feed(self, buf):
        # Subclasses consume ``buf`` here; the base accepts and ignores it.
        pass

    @property
    def state(self):
        """Current :class:`ProbingState` of this prober."""
        return self._state

    def get_confidence(self):
        # Base class is never confident; subclasses override.
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """Collapse every run of ASCII bytes in ``buf`` into one space.

        Only bytes >= 0x80 survive, which isolates the high-byte signal
        for single-byte charset analysis.
        """
        return re.sub(b'([\x00-\x7F])+', b' ', buf)

    @staticmethod
    def filter_international_words(buf):
        """
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        """
        kept = bytearray()

        # Each match is a word containing at least one international byte,
        # possibly followed by a single trailing marker byte.
        for token in re.findall(
                b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf):
            kept.extend(token[:-1])

            # A trailing marker is normalized to a space: markers are used
            # similarly across languages, so their identity carries no
            # charset signal and should not skew frequency analysis.
            tail = token[-1:]
            if tail < b'\x80' and not tail.isalpha():
                tail = b' '
            kept.extend(tail)

        return kept

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        kept = bytearray()
        inside_markup = False
        run_start = 0

        for pos in range(len(buf)):
            # Slice rather than index so we get bytes, not an int, on Python 3.
            octet = buf[pos:pos + 1]

            # Track HTML-tag state.  Note '>' clears the flag *before* the
            # keep-check below, which is what preserves the bytes right
            # before a '>'.
            if octet == b'>':
                inside_markup = False
            elif octet == b'<':
                inside_markup = True

            # A plain-ASCII, non-alphabetic byte ends the current run.
            if octet < b'\x80' and not octet.isalpha():
                if pos > run_start and not inside_markup:
                    # Flush the run we kept, delimited by a single space.
                    kept.extend(buf[run_start:pos])
                    kept.extend(b' ')
                run_start = pos + 1

        # Flush any trailing run, unless it sits inside an unclosed tag.
        if not inside_markup:
            kept.extend(buf[run_start:])

        return kept