diff options
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py | 409 |
1 files changed, 0 insertions, 409 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py deleted file mode 100644 index 641323e..0000000 --- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py +++ /dev/null | |||
@@ -1,409 +0,0 @@ | |||
1 | from __future__ import absolute_import, division, unicode_literals | ||
2 | from pip._vendor.six import text_type | ||
3 | |||
4 | import re | ||
5 | |||
6 | from codecs import register_error, xmlcharrefreplace_errors | ||
7 | |||
8 | from .constants import voidElements, booleanAttributes, spaceCharacters | ||
9 | from .constants import rcdataElements, entities, xmlEntities | ||
10 | from . import treewalkers, _utils | ||
11 | from xml.sax.saxutils import escape | ||
12 | |||
# Characters whose presence in an attribute value triggers quoting when the
# serializer's ``quote_attr_values`` option is "spec".
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[%s]" % _quoteAttributeSpecChars)

# Superset used when ``quote_attr_values`` is "legacy": the spec characters
# plus the control characters and assorted Unicode space/separator
# characters listed below.
_quoteAttributeLegacy = re.compile(
    "[%s%s]" % (
        _quoteAttributeSpecChars,
        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
        "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
        "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
        "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
        "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
        "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
        "\u3000",
    )
)
23 | |||
24 | |||
# Maps a code point (int) to the entity name used to represent it when a
# character cannot be encoded in the requested output encoding.
_encode_entity_map = {}
# True when a non-BMP character is a single code unit in this interpreter.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
    # Skip multi-character entities: more than one code unit on UCS-4
    # builds, more than two (one surrogate pair) otherwise.
    if len(v) > (1 if _is_ucs4 else 2):
        continue
    # The ampersand itself is never given an entry in the map.
    if v == "&":
        continue
    v = _utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
    # Prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.: an
    # all-lowercase name always replaces a previously stored name.
    if k.islower() or v not in _encode_entity_map:
        _encode_entity_map[v] = k
40 | |||
41 | |||
def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML entities.

    For encode/translate errors, every character in the failing range is
    replaced by ``&name;`` when ``_encode_entity_map`` has a name for its
    code point, and by a hexadecimal numeric character reference otherwise.
    Any other kind of error is delegated to ``xmlcharrefreplace_errors``.

    :arg exc: the Unicode error raised by the codec

    :returns: a ``(replacement, resume_position)`` tuple

    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    # First pass: turn the failing slice into code points, merging a
    # surrogate pair into the single code point it represents.
    codepoints = []
    second_half_pending = False
    for offset, char in enumerate(exc.object[exc.start:exc.end]):
        if second_half_pending:
            # This character was already consumed as part of a pair.
            second_half_pending = False
            continue
        pos = exc.start + offset
        if _utils.isSurrogatePair(exc.object[pos:min([exc.end, pos + 2])]):
            codepoints.append(
                _utils.surrogatePairToCodepoint(exc.object[pos:pos + 2]))
            second_half_pending = True
        else:
            codepoints.append(ord(char))

    # Second pass: render each code point as an entity reference.
    pieces = []
    for cp in codepoints:
        entity_name = _encode_entity_map.get(cp)
        if not entity_name:
            pieces.append("&#x%s;" % (hex(cp)[2:]))
            continue
        pieces.append("&")
        pieces.append(entity_name)
        if not entity_name.endswith(";"):
            pieces.append(";")
    return ("".join(pieces), exc.end)


# Make the handler available as ``errors="htmlentityreplace"``.
register_error("htmlentityreplace", htmlentityreplace_errors)
73 | |||
74 | |||
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Render a parsed tree to HTML with a freshly built serializer

    :arg input: the tree to walk and serialize

    :arg tree: name of the treewalker implementation to look up

    :arg encoding: output encoding, passed through to
        :py:meth:`HTMLSerializer.render`

    :arg serializer_opts: keyword options forwarded unchanged to the
        :py:class:`html5lib.serializer.HTMLSerializer` constructor

    :returns: whatever :py:meth:`HTMLSerializer.render` returns for the
        walked tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    make_walker = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = make_walker(input)
    return serializer.render(token_stream, encoding)
102 | |||
103 | |||
class HTMLSerializer(object):
    """Turn a stream of treewalker tokens into HTML text or bytes.

    Behaviour is driven by the class-level options below; any of them can
    be overridden per instance through keyword arguments to ``__init__``.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names accepted as keyword arguments by __init__.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not listed in ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        # An explicitly requested quote character disables automatic choice
        # (unless use_best_quote_char is also passed, which wins below).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` for output, replacing unencodable characters.

        Uses the ``"htmlentityreplace"`` error handler when ``self.encoding``
        (set by ``serialize``) is truthy; otherwise returns ``string`` as is.
        """
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        """Encode ``string`` for output, failing on unencodable characters.

        Uses the ``"strict"`` error handler when ``self.encoding`` (set by
        ``serialize``) is truthy; otherwise returns ``string`` as is.
        """
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def _escape_attribute_value(self, value, quoted):
        """Escape an attribute value and pick the quote character for it.

        :arg value: the raw attribute value (text)

        :arg quoted: whether the value will be wrapped in quotes

        :returns: ``(quote_char, escaped_value)``; ``quote_char`` is the
            empty string when ``quoted`` is false

        """
        # Ampersands are always escaped so the value cannot start a
        # character reference of its own.
        value = value.replace("&", "&amp;")
        if self.escape_lt_in_attrs:
            value = value.replace("<", "&lt;")
        if not quoted:
            return "", value

        quote_char = self.quote_char
        if self.use_best_quote_char:
            # Prefer the quote character that does not occur in the value,
            # so nothing needs escaping.
            if "'" in value and '"' not in value:
                quote_char = '"'
            elif '"' in value and "'" not in value:
                quote_char = "'"
        # Escape whichever quote character ends up delimiting the value.
        if quote_char == "'":
            value = value.replace("'", "&#39;")
        else:
            value = value.replace('"', "&quot;")
        return quote_char, value

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized output piece by piece.

        :arg treewalker: iterable of token dicts to serialize

        :arg encoding: output encoding; when falsy the pieces are text,
            otherwise they are encoded with this encoding

        :returns: a generator of output fragments

        :raises ValueError: if ``quote_attr_values`` is not one of
            ``"always"``, ``"spec"`` or ``"legacy"``

        Problems found in the token stream are recorded through
        ``serializeError``.
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Delimit the system identifier with the quote character
                    # it does not contain.
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A minimized boolean attribute is written as its bare
                    # name, with no "=value" part.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        attr_quote, v = self._escape_attribute_value(v, quote_attr)
                        if quote_attr:
                            yield self.encodeStrict(attr_quote)
                            yield self.encode(v)
                            yield self.encodeStrict(attr_quote)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                    # There is nothing to resolve an unknown entity to, so
                    # emit the reference unchanged instead of failing on
                    # the lookup below.
                    data = "&%s;" % name
                elif self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        # Fragments are bytes when an encoding is given, text otherwise, so
        # the joiner has to match.
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem; raise it when ``strict`` is set.

        :arg data: description of the problem

        :raises SerializeError: if ``self.strict`` is true

        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)
405 | |||
406 | |||
class SerializeError(Exception):
    """Raised by a strict serializer when the tree cannot be serialized cleanly."""