Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib')
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/__init__.py  35
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_ihatexml.py  288
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py  923
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py  1721
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/__init__.py  14
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/_base.py  37
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/datrie.py  44
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/py.py  67
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_utils.py  124
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/constants.py  2947
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/__init__.py  0
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/alphabeticalattributes.py  29
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/base.py  12
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/inject_meta_charset.py  73
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/lint.py  93
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/optionaltags.py  207
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/sanitizer.py  896
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/whitespace.py  38
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/html5parser.py  2791
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py  409
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/__init__.py  30
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/genshi.py  54
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/sax.py  50
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/__init__.py  88
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/base.py  417
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/dom.py  236
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree.py  340
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree_lxml.py  366
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/__init__.py  154
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/base.py  252
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/dom.py  43
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree.py  130
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree_lxml.py  213
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/genshi.py  69
34 files changed, 13190 insertions, 0 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/__init__.py
new file mode 100644
index 0000000..0b54002
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/__init__.py
@@ -0,0 +1,35 @@
1"""
2HTML parsing library based on the `WHATWG HTML specification
3<https://whatwg.org/html>`_. The parser is designed to be compatible with
4existing HTML found in the wild and implements well-defined error recovery that
5is largely compatible with modern desktop web browsers.
6
7Example usage::
8
9 from pip._vendor import html5lib
10 with open("my_document.html", "rb") as f:
11 tree = html5lib.parse(f)
12
13For convenience, this module re-exports the following names:
14
15* :func:`~.html5parser.parse`
16* :func:`~.html5parser.parseFragment`
17* :class:`~.html5parser.HTMLParser`
18* :func:`~.treebuilders.getTreeBuilder`
19* :func:`~.treewalkers.getTreeWalker`
20* :func:`~.serializer.serialize`
21"""
22
23from __future__ import absolute_import, division, unicode_literals
24
25from .html5parser import HTMLParser, parse, parseFragment
26from .treebuilders import getTreeBuilder
27from .treewalkers import getTreeWalker
28from .serializer import serialize
29
30__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
31 "getTreeWalker", "serialize"]
32
33# this has to be at the top level, see how setup.py parses this
34#: Distribution version number.
35__version__ = "1.0.1"
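
A minimal round-trip sketch using the re-exported names above (assuming the vendored copy is importable from this environment; the markup string is illustrative):

    from pip._vendor import html5lib

    # parse() error-recovers the malformed markup into an "etree" tree by
    # default; serialize() walks that tree back out as HTML.
    tree = html5lib.parse("<p class=demo>Hello<br>world")
    print(html5lib.serialize(tree))
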
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_ihatexml.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_ihatexml.py
new file mode 100644
index 0000000..68f9b1e
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_ihatexml.py
@@ -0,0 +1,288 @@
1from __future__ import absolute_import, division, unicode_literals
2
3import re
4import warnings
5
6from .constants import DataLossWarning
7
8baseChar = """
9[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
10[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
11[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
12[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
13[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
14[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
15[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
16[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
17[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
18[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
19[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
20[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
21[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
22[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
23[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
24[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
25[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
26[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
27[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
28[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
29[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
30[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
31[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
32[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
33[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
34[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
35[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
36[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
37[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
38[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
39#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
40#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
41#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
42[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
43[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
44#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
45[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
46[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
47[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
48[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
49[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
50#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
51[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
52[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
53[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
54[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
55
56ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
57
58combiningCharacter = """
59[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
60[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
61[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
62[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
63#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
64[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
65[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
66#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
67[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
68[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
69#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
70[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
71[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
72[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
73[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
74[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
75#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
76[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
77#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
78[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
79[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
80#x3099 | #x309A"""
81
82digit = """
83[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
84[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
85[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
86[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
87
88extender = """
89#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
90#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
91
92letter = " | ".join([baseChar, ideographic])
93
94# Without the
95name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
96 extender])
97nameFirst = " | ".join([letter, "_"])
98
99reChar = re.compile(r"#x([\d|A-F]{4,4})")
100reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
101
102
103def charStringToList(chars):
104 charRanges = [item.strip() for item in chars.split(" | ")]
105 rv = []
106 for item in charRanges:
107 foundMatch = False
108 for regexp in (reChar, reCharRange):
109 match = regexp.match(item)
110 if match is not None:
111 rv.append([hexToInt(item) for item in match.groups()])
112 if len(rv[-1]) == 1:
113 rv[-1] = rv[-1] * 2
114 foundMatch = True
115 break
116 if not foundMatch:
117 assert len(item) == 1
118
119 rv.append([ord(item)] * 2)
120 rv = normaliseCharList(rv)
121 return rv
122
123
124def normaliseCharList(charList):
125 charList = sorted(charList)
126 for item in charList:
127 assert item[1] >= item[0]
128 rv = []
129 i = 0
130 while i < len(charList):
131 j = 1
132 rv.append(charList[i])
133 while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
134 rv[-1][1] = charList[i + j][1]
135 j += 1
136 i += j
137 return rv
138
139# We don't really support characters above the BMP :(
140max_unicode = int("FFFF", 16)
141
142
143def missingRanges(charList):
144 rv = []
145 if charList[0] != 0:
146 rv.append([0, charList[0][0] - 1])
147 for i, item in enumerate(charList[:-1]):
148 rv.append([item[1] + 1, charList[i + 1][0] - 1])
149 if charList[-1][1] != max_unicode:
150 rv.append([charList[-1][1] + 1, max_unicode])
151 return rv
152
153
154def listToRegexpStr(charList):
155 rv = []
156 for item in charList:
157 if item[0] == item[1]:
158 rv.append(escapeRegexp(chr(item[0])))
159 else:
160 rv.append(escapeRegexp(chr(item[0])) + "-" +
161 escapeRegexp(chr(item[1])))
162 return "[%s]" % "".join(rv)
163
164
165def hexToInt(hex_str):
166 return int(hex_str, 16)
167
168
169def escapeRegexp(string):
170 specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
171 "[", "]", "|", "(", ")", "-")
172 for char in specialCharacters:
173 string = string.replace(char, "\\" + char)
174
175 return string
176
177# output from the above
178nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
179
180nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
181
182# Simpler things
183nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
184
185
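The precompiled patterns above are the output of the helper functions earlier in this file. A small sketch of regenerating one of them from its production, using the digit production as an illustration:

    import re
    from pip._vendor.html5lib._ihatexml import (
        charStringToList, digit, listToRegexpStr, missingRanges)

    # Build the complement of the "digit" production over the BMP, the same
    # way the non-XML-name patterns above were generated.
    digit_ranges = charStringToList(digit)
    non_digit = re.compile(listToRegexpStr(missingRanges(digit_ranges)))
    assert non_digit.match("a") and not non_digit.match("7")
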
186class InfosetFilter(object):
187 replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
188
189 def __init__(self,
190 dropXmlnsLocalName=False,
191 dropXmlnsAttrNs=False,
192 preventDoubleDashComments=False,
193 preventDashAtCommentEnd=False,
194 replaceFormFeedCharacters=True,
195 preventSingleQuotePubid=False):
196
197 self.dropXmlnsLocalName = dropXmlnsLocalName
198 self.dropXmlnsAttrNs = dropXmlnsAttrNs
199
200 self.preventDoubleDashComments = preventDoubleDashComments
201 self.preventDashAtCommentEnd = preventDashAtCommentEnd
202
203 self.replaceFormFeedCharacters = replaceFormFeedCharacters
204
205 self.preventSingleQuotePubid = preventSingleQuotePubid
206
207 self.replaceCache = {}
208
209 def coerceAttribute(self, name, namespace=None):
210 if self.dropXmlnsLocalName and name.startswith("xmlns:"):
211 warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
212 return None
213 elif (self.dropXmlnsAttrNs and
214 namespace == "http://www.w3.org/2000/xmlns/"):
215 warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
216 return None
217 else:
218 return self.toXmlName(name)
219
220 def coerceElement(self, name):
221 return self.toXmlName(name)
222
223 def coerceComment(self, data):
224 if self.preventDoubleDashComments:
225 while "--" in data:
226 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
227 data = data.replace("--", "- -")
228 if data.endswith("-"):
229 warnings.warn("Comments cannot end in a dash", DataLossWarning)
230 data += " "
231 return data
232
233 def coerceCharacters(self, data):
234 if self.replaceFormFeedCharacters:
235 for _ in range(data.count("\x0C")):
236 warnings.warn("Text cannot contain U+000C", DataLossWarning)
237 data = data.replace("\x0C", " ")
238 # Other non-xml characters
239 return data
240
241 def coercePubid(self, data):
242 dataOutput = data
243 for char in nonPubidCharRegexp.findall(data):
244 warnings.warn("Coercing non-XML pubid", DataLossWarning)
245 replacement = self.getReplacementCharacter(char)
246 dataOutput = dataOutput.replace(char, replacement)
247 if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
248 warnings.warn("Pubid cannot contain single quote", DataLossWarning)
249 dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
250 return dataOutput
251
252 def toXmlName(self, name):
253 nameFirst = name[0]
254 nameRest = name[1:]
255 m = nonXmlNameFirstBMPRegexp.match(nameFirst)
256 if m:
257 warnings.warn("Coercing non-XML name", DataLossWarning)
258 nameFirstOutput = self.getReplacementCharacter(nameFirst)
259 else:
260 nameFirstOutput = nameFirst
261
262 nameRestOutput = nameRest
263 replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
264 for char in replaceChars:
265 warnings.warn("Coercing non-XML name", DataLossWarning)
266 replacement = self.getReplacementCharacter(char)
267 nameRestOutput = nameRestOutput.replace(char, replacement)
268 return nameFirstOutput + nameRestOutput
269
270 def getReplacementCharacter(self, char):
271 if char in self.replaceCache:
272 replacement = self.replaceCache[char]
273 else:
274 replacement = self.escapeChar(char)
275 return replacement
276
277 def fromXmlName(self, name):
278 for item in set(self.replacementRegexp.findall(name)):
279 name = name.replace(item, self.unescapeChar(item))
280 return name
281
282 def escapeChar(self, char):
283 replacement = "U%05X" % ord(char)
284 self.replaceCache[char] = replacement
285 return replacement
286
287 def unescapeChar(self, charcode):
288 return chr(int(charcode[1:], 16))
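
A short sketch of the coercion behaviour InfosetFilter implements (the sample name "1foo" is illustrative; a DataLossWarning is emitted for it):

    from pip._vendor.html5lib._ihatexml import InfosetFilter

    # Digits may start an HTML name but not an XML name, so the leading "1"
    # is escaped to "U" plus its codepoint and can be round-tripped back.
    f = InfosetFilter()
    coerced = f.coerceElement("1foo")
    assert coerced == "U00031foo"
    assert f.fromXmlName(coerced) == "1foo"
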
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
new file mode 100644
index 0000000..21c6bbc
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_inputstream.py
@@ -0,0 +1,923 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from pip._vendor.six import text_type, binary_type
4from pip._vendor.six.moves import http_client, urllib
5
6import codecs
7import re
8
9from pip._vendor import webencodings
10
11from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
12from .constants import _ReparseException
13from . import _utils
14
15from io import StringIO
16
17try:
18 from io import BytesIO
19except ImportError:
20 BytesIO = StringIO
21
22# Non-unicode versions of constants for use in the pre-parser
23spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
24asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
25asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
26spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
27
28
29invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
30
31if _utils.supports_lone_surrogates:
32 # Use one extra step of indirection and create surrogates with
33 # eval. Not using this indirection would introduce an illegal
34 # unicode literal on platforms not supporting such lone
35 # surrogates.
36 assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
37 invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
38 eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
39 "]")
40else:
41 invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
42
43non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
44 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
45 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
46 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
47 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
48 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
49 0x10FFFE, 0x10FFFF])
50
51ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
52
53# Cache for charsUntil()
54charsUntilRegEx = {}
55
56
57class BufferedStream(object):
58 """Buffering for streams that do not have buffering of their own
59
60 The buffer is implemented as a list of chunks on the assumption that
61 joining many strings will be slow since it is O(n**2)
62 """
63
64 def __init__(self, stream):
65 self.stream = stream
66 self.buffer = []
67 self.position = [-1, 0] # chunk number, offset
68
69 def tell(self):
70 pos = 0
71 for chunk in self.buffer[:self.position[0]]:
72 pos += len(chunk)
73 pos += self.position[1]
74 return pos
75
76 def seek(self, pos):
77 assert pos <= self._bufferedBytes()
78 offset = pos
79 i = 0
80 while len(self.buffer[i]) < offset:
81 offset -= len(self.buffer[i])
82 i += 1
83 self.position = [i, offset]
84
85 def read(self, bytes):
86 if not self.buffer:
87 return self._readStream(bytes)
88 elif (self.position[0] == len(self.buffer) and
89 self.position[1] == len(self.buffer[-1])):
90 return self._readStream(bytes)
91 else:
92 return self._readFromBuffer(bytes)
93
94 def _bufferedBytes(self):
95 return sum([len(item) for item in self.buffer])
96
97 def _readStream(self, bytes):
98 data = self.stream.read(bytes)
99 self.buffer.append(data)
100 self.position[0] += 1
101 self.position[1] = len(data)
102 return data
103
104 def _readFromBuffer(self, bytes):
105 remainingBytes = bytes
106 rv = []
107 bufferIndex = self.position[0]
108 bufferOffset = self.position[1]
109 while bufferIndex < len(self.buffer) and remainingBytes != 0:
110 assert remainingBytes > 0
111 bufferedData = self.buffer[bufferIndex]
112
113 if remainingBytes <= len(bufferedData) - bufferOffset:
114 bytesToRead = remainingBytes
115 self.position = [bufferIndex, bufferOffset + bytesToRead]
116 else:
117 bytesToRead = len(bufferedData) - bufferOffset
118 self.position = [bufferIndex, len(bufferedData)]
119 bufferIndex += 1
120 rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
121 remainingBytes -= bytesToRead
122
123 bufferOffset = 0
124
125 if remainingBytes:
126 rv.append(self._readStream(remainingBytes))
127
128 return b"".join(rv)
129
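A sketch of the buffering contract described in the class docstring (a BytesIO stands in here for a non-seekable, socket-like stream):

    from io import BytesIO
    from pip._vendor.html5lib._inputstream import BufferedStream

    raw = BytesIO(b"<!DOCTYPE html><p>hi</p>")
    buffered = BufferedStream(raw)
    head = buffered.read(15)      # first read goes to the wrapped stream
    buffered.seek(0)              # seeking stays inside the remembered chunks
    assert buffered.read(15) == head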
130
131def HTMLInputStream(source, **kwargs):
132 # Work around Python bug #20007: read(0) closes the connection.
133 # http://bugs.python.org/issue20007
134 if (isinstance(source, http_client.HTTPResponse) or
135 # Also check for addinfourl wrapping HTTPResponse
136 (isinstance(source, urllib.response.addbase) and
137 isinstance(source.fp, http_client.HTTPResponse))):
138 isUnicode = False
139 elif hasattr(source, "read"):
140 isUnicode = isinstance(source.read(0), text_type)
141 else:
142 isUnicode = isinstance(source, text_type)
143
144 if isUnicode:
145 encodings = [x for x in kwargs if x.endswith("_encoding")]
146 if encodings:
147 raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
148
149 return HTMLUnicodeInputStream(source, **kwargs)
150 else:
151 return HTMLBinaryInputStream(source, **kwargs)
152
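A dispatch sketch for the factory above: text input gets the unicode stream directly, while byte input goes through the binary stream and its encoding sniffing:

    from pip._vendor.html5lib._inputstream import (
        HTMLBinaryInputStream, HTMLInputStream, HTMLUnicodeInputStream)

    # str input -> HTMLUnicodeInputStream, bytes input -> HTMLBinaryInputStream
    assert type(HTMLInputStream("<p>text</p>")) is HTMLUnicodeInputStream
    assert type(HTMLInputStream(b"<p>bytes</p>")) is HTMLBinaryInputStream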
153
154class HTMLUnicodeInputStream(object):
155 """Provides a unicode stream of characters to the HTMLTokenizer.
156
157 This class takes care of character encoding and removing or replacing
158 incorrect byte-sequences and also provides column and line tracking.
159
160 """
161
162 _defaultChunkSize = 10240
163
164 def __init__(self, source):
165 """Initialises the HTMLInputStream.
166
167 HTMLInputStream(source, [encoding]) -> Normalized stream from source
168 for use by html5lib.
169
170 source can be either a file-object, local filename or a string.
171
172 The optional encoding parameter must be a string that indicates
173 the encoding. If specified, that encoding will be used,
174 regardless of any BOM or later declaration (such as in a meta
175 element)
176
177 """
178
179 if not _utils.supports_lone_surrogates:
180 # Such platforms will have already checked for such
181 # surrogate errors, so no need to do this checking.
182 self.reportCharacterErrors = None
183 elif len("\U0010FFFF") == 1:
184 self.reportCharacterErrors = self.characterErrorsUCS4
185 else:
186 self.reportCharacterErrors = self.characterErrorsUCS2
187
188 # List of where new lines occur
189 self.newLines = [0]
190
191 self.charEncoding = (lookupEncoding("utf-8"), "certain")
192 self.dataStream = self.openStream(source)
193
194 self.reset()
195
196 def reset(self):
197 self.chunk = ""
198 self.chunkSize = 0
199 self.chunkOffset = 0
200 self.errors = []
201
202 # number of (complete) lines in previous chunks
203 self.prevNumLines = 0
204 # number of columns in the last line of the previous chunk
205 self.prevNumCols = 0
206
207 # Deal with CR LF and surrogates split over chunk boundaries
208 self._bufferedCharacter = None
209
210 def openStream(self, source):
211 """Produces a file object from source.
212
213 source can be either a file object, local filename or a string.
214
215 """
216 # Already a file object
217 if hasattr(source, 'read'):
218 stream = source
219 else:
220 stream = StringIO(source)
221
222 return stream
223
224 def _position(self, offset):
225 chunk = self.chunk
226 nLines = chunk.count('\n', 0, offset)
227 positionLine = self.prevNumLines + nLines
228 lastLinePos = chunk.rfind('\n', 0, offset)
229 if lastLinePos == -1:
230 positionColumn = self.prevNumCols + offset
231 else:
232 positionColumn = offset - (lastLinePos + 1)
233 return (positionLine, positionColumn)
234
235 def position(self):
236 """Returns (line, col) of the current position in the stream."""
237 line, col = self._position(self.chunkOffset)
238 return (line + 1, col)
239
240 def char(self):
241 """ Read one character from the stream or queue if available. Return
242 EOF when EOF is reached.
243 """
244 # Read a new chunk from the input stream if necessary
245 if self.chunkOffset >= self.chunkSize:
246 if not self.readChunk():
247 return EOF
248
249 chunkOffset = self.chunkOffset
250 char = self.chunk[chunkOffset]
251 self.chunkOffset = chunkOffset + 1
252
253 return char
254
255 def readChunk(self, chunkSize=None):
256 if chunkSize is None:
257 chunkSize = self._defaultChunkSize
258
259 self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
260
261 self.chunk = ""
262 self.chunkSize = 0
263 self.chunkOffset = 0
264
265 data = self.dataStream.read(chunkSize)
266
267 # Deal with CR LF and surrogates broken across chunks
268 if self._bufferedCharacter:
269 data = self._bufferedCharacter + data
270 self._bufferedCharacter = None
271 elif not data:
272 # We have no more data, bye-bye stream
273 return False
274
275 if len(data) > 1:
276 lastv = ord(data[-1])
277 if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
278 self._bufferedCharacter = data[-1]
279 data = data[:-1]
280
281 if self.reportCharacterErrors:
282 self.reportCharacterErrors(data)
283
284 # Replace invalid characters
285 data = data.replace("\r\n", "\n")
286 data = data.replace("\r", "\n")
287
288 self.chunk = data
289 self.chunkSize = len(data)
290
291 return True
292
293 def characterErrorsUCS4(self, data):
294 for _ in range(len(invalid_unicode_re.findall(data))):
295 self.errors.append("invalid-codepoint")
296
297 def characterErrorsUCS2(self, data):
298 # Someone picked the wrong compile option
299 # You lose
300 skip = False
301 for match in invalid_unicode_re.finditer(data):
302 if skip:
303 continue
304 codepoint = ord(match.group())
305 pos = match.start()
306 # Pretty sure there should be endianness issues here
307 if _utils.isSurrogatePair(data[pos:pos + 2]):
308 # We have a surrogate pair!
309 char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
310 if char_val in non_bmp_invalid_codepoints:
311 self.errors.append("invalid-codepoint")
312 skip = True
313 elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
314 pos == len(data) - 1):
315 self.errors.append("invalid-codepoint")
316 else:
317 skip = False
318 self.errors.append("invalid-codepoint")
319
320 def charsUntil(self, characters, opposite=False):
321 """ Returns a string of characters from the stream up to but not
322 including any character in 'characters' or EOF. 'characters' must be
323 a container that supports the 'in' method and iteration over its
324 characters.
325 """
326
327 # Use a cache of regexps to find the required characters
328 try:
329 chars = charsUntilRegEx[(characters, opposite)]
330 except KeyError:
331 if __debug__:
332 for c in characters:
333 assert(ord(c) < 128)
334 regex = "".join(["\\x%02x" % ord(c) for c in characters])
335 if not opposite:
336 regex = "^%s" % regex
337 chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
338
339 rv = []
340
341 while True:
342 # Find the longest matching prefix
343 m = chars.match(self.chunk, self.chunkOffset)
344 if m is None:
345 # If nothing matched, and it wasn't because we ran out of chunk,
346 # then stop
347 if self.chunkOffset != self.chunkSize:
348 break
349 else:
350 end = m.end()
351 # If not the whole chunk matched, return everything
352 # up to the part that didn't match
353 if end != self.chunkSize:
354 rv.append(self.chunk[self.chunkOffset:end])
355 self.chunkOffset = end
356 break
357 # If the whole remainder of the chunk matched,
358 # use it all and read the next chunk
359 rv.append(self.chunk[self.chunkOffset:])
360 if not self.readChunk():
361 # Reached EOF
362 break
363
364 r = "".join(rv)
365 return r
366
367 def unget(self, char):
368 # Only one character is allowed to be ungotten at once - it must
369 # be consumed again before any further call to unget
370 if char is not None:
371 if self.chunkOffset == 0:
372 # unget is called quite rarely, so it's a good idea to do
373 # more work here if it saves a bit of work in the frequently
374 # called char and charsUntil.
375 # So, just prepend the ungotten character onto the current
376 # chunk:
377 self.chunk = char + self.chunk
378 self.chunkSize += 1
379 else:
380 self.chunkOffset -= 1
381 assert self.chunk[self.chunkOffset] == char
382
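A character-level sketch of the stream API the tokenizer drives (char(), charsUntil() and unget(); note that CR LF has already been normalized to LF by readChunk()):

    from pip._vendor.html5lib._inputstream import HTMLInputStream

    stream = HTMLInputStream("line one\r\nline <b>two")
    assert stream.charsUntil("<") == "line one\nline "
    lt = stream.char()
    assert lt == "<" and stream.position() == (2, 6)
    stream.unget(lt)              # at most one character can be pushed back
    assert stream.char() == "<"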
383
384class HTMLBinaryInputStream(HTMLUnicodeInputStream):
385 """Provides a unicode stream of characters to the HTMLTokenizer.
386
387 This class takes care of character encoding and removing or replacing
388 incorrect byte-sequences and also provides column and line tracking.
389
390 """
391
392 def __init__(self, source, override_encoding=None, transport_encoding=None,
393 same_origin_parent_encoding=None, likely_encoding=None,
394 default_encoding="windows-1252", useChardet=True):
395 """Initialises the HTMLInputStream.
396
397 HTMLInputStream(source, [encoding]) -> Normalized stream from source
398 for use by html5lib.
399
400 source can be either a file-object, local filename or a string.
401
402 The optional encoding parameter must be a string that indicates
403 the encoding. If specified, that encoding will be used,
404 regardless of any BOM or later declaration (such as in a meta
405 element)
406
407 """
408 # Raw Stream - for unicode objects this will encode to utf-8 and set
409 # self.charEncoding as appropriate
410 self.rawStream = self.openStream(source)
411
412 HTMLUnicodeInputStream.__init__(self, self.rawStream)
413
414 # Encoding Information
415 # Number of bytes to use when looking for a meta element with
416 # encoding information
417 self.numBytesMeta = 1024
418 # Number of bytes to use when using detecting encoding using chardet
419 self.numBytesChardet = 100
420 # Things from args
421 self.override_encoding = override_encoding
422 self.transport_encoding = transport_encoding
423 self.same_origin_parent_encoding = same_origin_parent_encoding
424 self.likely_encoding = likely_encoding
425 self.default_encoding = default_encoding
426
427 # Determine encoding
428 self.charEncoding = self.determineEncoding(useChardet)
429 assert self.charEncoding[0] is not None
430
431 # Call superclass
432 self.reset()
433
434 def reset(self):
435 self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
436 HTMLUnicodeInputStream.reset(self)
437
438 def openStream(self, source):
439 """Produces a file object from source.
440
441 source can be either a file object, local filename or a string.
442
443 """
444 # Already a file object
445 if hasattr(source, 'read'):
446 stream = source
447 else:
448 stream = BytesIO(source)
449
450 try:
451 stream.seek(stream.tell())
452 except: # pylint:disable=bare-except
453 stream = BufferedStream(stream)
454
455 return stream
456
457 def determineEncoding(self, chardet=True):
458 # BOMs take precedence over everything
459 # This will also read past the BOM if present
460 charEncoding = self.detectBOM(), "certain"
461 if charEncoding[0] is not None:
462 return charEncoding
463
464        # If we've been overridden, we've been overridden
465 charEncoding = lookupEncoding(self.override_encoding), "certain"
466 if charEncoding[0] is not None:
467 return charEncoding
468
469 # Now check the transport layer
470 charEncoding = lookupEncoding(self.transport_encoding), "certain"
471 if charEncoding[0] is not None:
472 return charEncoding
473
474 # Look for meta elements with encoding information
475 charEncoding = self.detectEncodingMeta(), "tentative"
476 if charEncoding[0] is not None:
477 return charEncoding
478
479 # Parent document encoding
480 charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
481 if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
482 return charEncoding
483
484 # "likely" encoding
485 charEncoding = lookupEncoding(self.likely_encoding), "tentative"
486 if charEncoding[0] is not None:
487 return charEncoding
488
489 # Guess with chardet, if available
490 if chardet:
491 try:
492 from pip._vendor.chardet.universaldetector import UniversalDetector
493 except ImportError:
494 pass
495 else:
496 buffers = []
497 detector = UniversalDetector()
498 while not detector.done:
499 buffer = self.rawStream.read(self.numBytesChardet)
500 assert isinstance(buffer, bytes)
501 if not buffer:
502 break
503 buffers.append(buffer)
504 detector.feed(buffer)
505 detector.close()
506 encoding = lookupEncoding(detector.result['encoding'])
507 self.rawStream.seek(0)
508 if encoding is not None:
509 return encoding, "tentative"
510
511 # Try the default encoding
512 charEncoding = lookupEncoding(self.default_encoding), "tentative"
513 if charEncoding[0] is not None:
514 return charEncoding
515
516 # Fallback to html5lib's default if even that hasn't worked
517 return lookupEncoding("windows-1252"), "tentative"
518
519 def changeEncoding(self, newEncoding):
520 assert self.charEncoding[1] != "certain"
521 newEncoding = lookupEncoding(newEncoding)
522 if newEncoding is None:
523 return
524 if newEncoding.name in ("utf-16be", "utf-16le"):
525 newEncoding = lookupEncoding("utf-8")
526 assert newEncoding is not None
527 elif newEncoding == self.charEncoding[0]:
528 self.charEncoding = (self.charEncoding[0], "certain")
529 else:
530 self.rawStream.seek(0)
531 self.charEncoding = (newEncoding, "certain")
532 self.reset()
533 raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
534
535 def detectBOM(self):
536 """Attempts to detect at BOM at the start of the stream. If
537 an encoding can be determined from the BOM return the name of the
538 encoding otherwise return None"""
539 bomDict = {
540 codecs.BOM_UTF8: 'utf-8',
541 codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
542 codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
543 }
544
545 # Go to beginning of file and read in 4 bytes
546 string = self.rawStream.read(4)
547 assert isinstance(string, bytes)
548
549 # Try detecting the BOM using bytes from the string
550 encoding = bomDict.get(string[:3]) # UTF-8
551 seek = 3
552 if not encoding:
553 # Need to detect UTF-32 before UTF-16
554 encoding = bomDict.get(string) # UTF-32
555 seek = 4
556 if not encoding:
557 encoding = bomDict.get(string[:2]) # UTF-16
558 seek = 2
559
560 # Set the read position past the BOM if one was found, otherwise
561 # set it to the start of the stream
562 if encoding:
563 self.rawStream.seek(seek)
564 return lookupEncoding(encoding)
565 else:
566 self.rawStream.seek(0)
567 return None
568
569 def detectEncodingMeta(self):
570 """Report the encoding declared by the meta element
571 """
572 buffer = self.rawStream.read(self.numBytesMeta)
573 assert isinstance(buffer, bytes)
574 parser = EncodingParser(buffer)
575 self.rawStream.seek(0)
576 encoding = parser.getEncoding()
577
578 if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
579 encoding = lookupEncoding("utf-8")
580
581 return encoding
582
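A sketch of the precedence ladder implemented by determineEncoding(): a BOM is authoritative, a transport-level encoding beats a meta declaration, and a meta declaration is only ever tentative:

    from pip._vendor.html5lib._inputstream import HTMLInputStream

    bom = HTMLInputStream(b"\xef\xbb\xbf<p>hi</p>")
    assert (bom.charEncoding[0].name, bom.charEncoding[1]) == ("utf-8", "certain")

    meta = HTMLInputStream(b"<meta charset='ISO-8859-2'><p>hi</p>")
    assert (meta.charEncoding[0].name, meta.charEncoding[1]) == ("iso-8859-2", "tentative")

    transport = HTMLInputStream(b"<meta charset='ISO-8859-2'>", transport_encoding="utf-8")
    assert transport.charEncoding[0].name == "utf-8"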
583
584class EncodingBytes(bytes):
585 """String-like object with an associated position and various extra methods
586 If the position is ever greater than the string length then an exception is
587 raised"""
588 def __new__(self, value):
589 assert isinstance(value, bytes)
590 return bytes.__new__(self, value.lower())
591
592 def __init__(self, value):
593 # pylint:disable=unused-argument
594 self._position = -1
595
596 def __iter__(self):
597 return self
598
599 def __next__(self):
600 p = self._position = self._position + 1
601 if p >= len(self):
602 raise StopIteration
603 elif p < 0:
604 raise TypeError
605 return self[p:p + 1]
606
607 def next(self):
608 # Py2 compat
609 return self.__next__()
610
611 def previous(self):
612 p = self._position
613 if p >= len(self):
614 raise StopIteration
615 elif p < 0:
616 raise TypeError
617 self._position = p = p - 1
618 return self[p:p + 1]
619
620 def setPosition(self, position):
621 if self._position >= len(self):
622 raise StopIteration
623 self._position = position
624
625 def getPosition(self):
626 if self._position >= len(self):
627 raise StopIteration
628 if self._position >= 0:
629 return self._position
630 else:
631 return None
632
633 position = property(getPosition, setPosition)
634
635 def getCurrentByte(self):
636 return self[self.position:self.position + 1]
637
638 currentByte = property(getCurrentByte)
639
640 def skip(self, chars=spaceCharactersBytes):
641 """Skip past a list of characters"""
642 p = self.position # use property for the error-checking
643 while p < len(self):
644 c = self[p:p + 1]
645 if c not in chars:
646 self._position = p
647 return c
648 p += 1
649 self._position = p
650 return None
651
652 def skipUntil(self, chars):
653 p = self.position
654 while p < len(self):
655 c = self[p:p + 1]
656 if c in chars:
657 self._position = p
658 return c
659 p += 1
660 self._position = p
661 return None
662
663 def matchBytes(self, bytes):
664 """Look for a sequence of bytes at the start of a string. If the bytes
665 are found return True and advance the position to the byte after the
666 match. Otherwise return False and leave the position alone"""
667 p = self.position
668 data = self[p:p + len(bytes)]
669 rv = data.startswith(bytes)
670 if rv:
671 self.position += len(bytes)
672 return rv
673
674 def jumpTo(self, bytes):
675 """Look for the next sequence of bytes matching a given sequence. If
676 a match is found advance the position to the last byte of the match"""
677 newPosition = self[self.position:].find(bytes)
678 if newPosition > -1:
679 # XXX: This is ugly, but I can't see a nicer way to fix this.
680 if self._position == -1:
681 self._position = 0
682 self._position += (newPosition + len(bytes) - 1)
683 return True
684 else:
685 raise StopIteration
686
687
688class EncodingParser(object):
689 """Mini parser for detecting character encoding from meta elements"""
690
691 def __init__(self, data):
692 """string - the data to work on for encoding detection"""
693 self.data = EncodingBytes(data)
694 self.encoding = None
695
696 def getEncoding(self):
697 methodDispatch = (
698 (b"<!--", self.handleComment),
699 (b"<meta", self.handleMeta),
700 (b"</", self.handlePossibleEndTag),
701 (b"<!", self.handleOther),
702 (b"<?", self.handleOther),
703 (b"<", self.handlePossibleStartTag))
704 for _ in self.data:
705 keepParsing = True
706 for key, method in methodDispatch:
707 if self.data.matchBytes(key):
708 try:
709 keepParsing = method()
710 break
711 except StopIteration:
712 keepParsing = False
713 break
714 if not keepParsing:
715 break
716
717 return self.encoding
718
719 def handleComment(self):
720 """Skip over comments"""
721 return self.data.jumpTo(b"-->")
722
723 def handleMeta(self):
724 if self.data.currentByte not in spaceCharactersBytes:
725 # if we have <meta not followed by a space so just keep going
726 return True
727 # We have a valid meta element we want to search for attributes
728 hasPragma = False
729 pendingEncoding = None
730 while True:
731 # Try to find the next attribute after the current position
732 attr = self.getAttribute()
733 if attr is None:
734 return True
735 else:
736 if attr[0] == b"http-equiv":
737 hasPragma = attr[1] == b"content-type"
738 if hasPragma and pendingEncoding is not None:
739 self.encoding = pendingEncoding
740 return False
741 elif attr[0] == b"charset":
742 tentativeEncoding = attr[1]
743 codec = lookupEncoding(tentativeEncoding)
744 if codec is not None:
745 self.encoding = codec
746 return False
747 elif attr[0] == b"content":
748 contentParser = ContentAttrParser(EncodingBytes(attr[1]))
749 tentativeEncoding = contentParser.parse()
750 if tentativeEncoding is not None:
751 codec = lookupEncoding(tentativeEncoding)
752 if codec is not None:
753 if hasPragma:
754 self.encoding = codec
755 return False
756 else:
757 pendingEncoding = codec
758
759 def handlePossibleStartTag(self):
760 return self.handlePossibleTag(False)
761
762 def handlePossibleEndTag(self):
763 next(self.data)
764 return self.handlePossibleTag(True)
765
766 def handlePossibleTag(self, endTag):
767 data = self.data
768 if data.currentByte not in asciiLettersBytes:
769 # If the next byte is not an ascii letter either ignore this
770 # fragment (possible start tag case) or treat it according to
771 # handleOther
772 if endTag:
773 data.previous()
774 self.handleOther()
775 return True
776
777 c = data.skipUntil(spacesAngleBrackets)
778 if c == b"<":
779 # return to the first step in the overall "two step" algorithm
780 # reprocessing the < byte
781 data.previous()
782 else:
783 # Read all attributes
784 attr = self.getAttribute()
785 while attr is not None:
786 attr = self.getAttribute()
787 return True
788
789 def handleOther(self):
790 return self.data.jumpTo(b">")
791
792 def getAttribute(self):
793 """Return a name,value pair for the next attribute in the stream,
794 if one is found, or None"""
795 data = self.data
796 # Step 1 (skip chars)
797 c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
798 assert c is None or len(c) == 1
799 # Step 2
800 if c in (b">", None):
801 return None
802 # Step 3
803 attrName = []
804 attrValue = []
805 # Step 4 attribute name
806 while True:
807 if c == b"=" and attrName:
808 break
809 elif c in spaceCharactersBytes:
810 # Step 6!
811 c = data.skip()
812 break
813 elif c in (b"/", b">"):
814 return b"".join(attrName), b""
815 elif c in asciiUppercaseBytes:
816 attrName.append(c.lower())
817 elif c is None:
818 return None
819 else:
820 attrName.append(c)
821 # Step 5
822 c = next(data)
823 # Step 7
824 if c != b"=":
825 data.previous()
826 return b"".join(attrName), b""
827 # Step 8
828 next(data)
829 # Step 9
830 c = data.skip()
831 # Step 10
832 if c in (b"'", b'"'):
833 # 10.1
834 quoteChar = c
835 while True:
836 # 10.2
837 c = next(data)
838 # 10.3
839 if c == quoteChar:
840 next(data)
841 return b"".join(attrName), b"".join(attrValue)
842 # 10.4
843 elif c in asciiUppercaseBytes:
844 attrValue.append(c.lower())
845 # 10.5
846 else:
847 attrValue.append(c)
848 elif c == b">":
849 return b"".join(attrName), b""
850 elif c in asciiUppercaseBytes:
851 attrValue.append(c.lower())
852 elif c is None:
853 return None
854 else:
855 attrValue.append(c)
856 # Step 11
857 while True:
858 c = next(data)
859 if c in spacesAngleBrackets:
860 return b"".join(attrName), b"".join(attrValue)
861 elif c in asciiUppercaseBytes:
862 attrValue.append(c.lower())
863 elif c is None:
864 return None
865 else:
866 attrValue.append(c)
867
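A pre-scan sketch showing how EncodingParser finds a charset in the first chunk of bytes without running the full tokenizer (the sample markup is illustrative):

    from pip._vendor.html5lib._inputstream import EncodingParser

    head = (b'<!-- no charset here --> '
            b'<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">')
    codec = EncodingParser(head).getEncoding()
    assert codec is not None and codec.name == "euc-jp"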
868
869class ContentAttrParser(object):
870 def __init__(self, data):
871 assert isinstance(data, bytes)
872 self.data = data
873
874 def parse(self):
875 try:
876 # Check if the attr name is charset
877 # otherwise return
878 self.data.jumpTo(b"charset")
879 self.data.position += 1
880 self.data.skip()
881 if not self.data.currentByte == b"=":
882 # If there is no = sign keep looking for attrs
883 return None
884 self.data.position += 1
885 self.data.skip()
886 # Look for an encoding between matching quote marks
887 if self.data.currentByte in (b'"', b"'"):
888 quoteMark = self.data.currentByte
889 self.data.position += 1
890 oldPosition = self.data.position
891 if self.data.jumpTo(quoteMark):
892 return self.data[oldPosition:self.data.position]
893 else:
894 return None
895 else:
896 # Unquoted value
897 oldPosition = self.data.position
898 try:
899 self.data.skipUntil(spaceCharactersBytes)
900 return self.data[oldPosition:self.data.position]
901 except StopIteration:
902 # Return the whole remaining value
903 return self.data[oldPosition:]
904 except StopIteration:
905 return None
906
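And a companion sketch for ContentAttrParser on its own (EncodingBytes lower-cases its input, so the returned label is already lower-case):

    from pip._vendor.html5lib._inputstream import ContentAttrParser, EncodingBytes

    parser = ContentAttrParser(EncodingBytes(b"text/html; charset=UTF-8"))
    assert parser.parse() == b"utf-8"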
907
908def lookupEncoding(encoding):
909 """Return the python codec name corresponding to an encoding or None if the
910 string doesn't correspond to a valid encoding."""
911 if isinstance(encoding, binary_type):
912 try:
913 encoding = encoding.decode("ascii")
914 except UnicodeDecodeError:
915 return None
916
917 if encoding is not None:
918 try:
919 return webencodings.lookup(encoding)
920 except AttributeError:
921 return None
922 else:
923 return None
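
lookupEncoding() defers to webencodings, so WHATWG label aliasing applies; a quick sketch:

    from pip._vendor.html5lib._inputstream import lookupEncoding

    assert lookupEncoding("latin1").name == "windows-1252"   # WHATWG alias
    assert lookupEncoding(b"UTF8").name == "utf-8"           # bytes labels are decoded first
    assert lookupEncoding("no-such-encoding") is None
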
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py
new file mode 100644
index 0000000..ef1ccf8
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py
@@ -0,0 +1,1721 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from pip._vendor.six import unichr as chr
4
5from collections import deque
6
7from .constants import spaceCharacters
8from .constants import entities
9from .constants import asciiLetters, asciiUpper2Lower
10from .constants import digits, hexDigits, EOF
11from .constants import tokenTypes, tagTokenTypes
12from .constants import replacementCharacters
13
14from ._inputstream import HTMLInputStream
15
16from ._trie import Trie
17
18entitiesTrie = Trie(entities)
19
20
21class HTMLTokenizer(object):
22 """ This class takes care of tokenizing HTML.
23
24 * self.currentToken
25 Holds the token that is currently being processed.
26
27 * self.state
28 Holds a reference to the method to be invoked... XXX
29
30 * self.stream
31 Points to HTMLInputStream object.
32 """
33
34 def __init__(self, stream, parser=None, **kwargs):
35
36 self.stream = HTMLInputStream(stream, **kwargs)
37 self.parser = parser
38
39 # Setup the initial tokenizer state
40 self.escapeFlag = False
41 self.lastFourChars = []
42 self.state = self.dataState
43 self.escape = False
44
45 # The current token being created
46 self.currentToken = None
47 super(HTMLTokenizer, self).__init__()
48
49 def __iter__(self):
50 """ This is where the magic happens.
51
52 We do our usually processing through the states and when we have a token
53 to return we yield the token which pauses processing until the next token
54 is requested.
55 """
56 self.tokenQueue = deque([])
57 # Start processing. When EOF is reached self.state will return False
58 # instead of True and the loop will terminate.
59 while self.state():
60 while self.stream.errors:
61 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
62 while self.tokenQueue:
63 yield self.tokenQueue.popleft()
64
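A token-stream sketch for the iterator above (token type codes come from constants.tokenTypes; the markup is illustrative):

    from pip._vendor.html5lib._tokenizer import HTMLTokenizer
    from pip._vendor.html5lib.constants import tokenTypes

    names = {code: name for name, code in tokenTypes.items()}
    for token in HTMLTokenizer("<p class=x>hi"):
        print(names[token["type"]], token.get("name", token.get("data")))
    # expected output: "StartTag p" then "Characters hi"
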
65 def consumeNumberEntity(self, isHex):
66 """This function returns either U+FFFD or the character based on the
67 decimal or hexadecimal representation. It also discards ";" if present.
68 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
69 """
70
71 allowed = digits
72 radix = 10
73 if isHex:
74 allowed = hexDigits
75 radix = 16
76
77 charStack = []
78
79 # Consume all the characters that are in range while making sure we
80 # don't hit an EOF.
81 c = self.stream.char()
82 while c in allowed and c is not EOF:
83 charStack.append(c)
84 c = self.stream.char()
85
86 # Convert the set of characters consumed to an int.
87 charAsInt = int("".join(charStack), radix)
88
89 # Certain characters get replaced with others
90 if charAsInt in replacementCharacters:
91 char = replacementCharacters[charAsInt]
92 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
93 "illegal-codepoint-for-numeric-entity",
94 "datavars": {"charAsInt": charAsInt}})
95 elif ((0xD800 <= charAsInt <= 0xDFFF) or
96 (charAsInt > 0x10FFFF)):
97 char = "\uFFFD"
98 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99 "illegal-codepoint-for-numeric-entity",
100 "datavars": {"charAsInt": charAsInt}})
101 else:
102 # Should speed up this check somehow (e.g. move the set to a constant)
103 if ((0x0001 <= charAsInt <= 0x0008) or
104 (0x000E <= charAsInt <= 0x001F) or
105 (0x007F <= charAsInt <= 0x009F) or
106 (0xFDD0 <= charAsInt <= 0xFDEF) or
107 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
108 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
109 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
110 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
111 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
112 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
113 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
114 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
115 0xFFFFF, 0x10FFFE, 0x10FFFF])):
116 self.tokenQueue.append({"type": tokenTypes["ParseError"],
117 "data":
118 "illegal-codepoint-for-numeric-entity",
119 "datavars": {"charAsInt": charAsInt}})
120 try:
121 # Try/except needed as UCS-2 Python builds' unichar only works
122 # within the BMP.
123 char = chr(charAsInt)
124 except ValueError:
125 v = charAsInt - 0x10000
126 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
127
128 # Discard the ; if present. Otherwise, put it back on the queue and
129 # invoke parseError on parser.
130 if c != ";":
131 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
132 "numeric-entity-without-semicolon"})
133 self.stream.unget(c)
134
135 return char
136
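A worked example of the non-BMP fallback above (the surrogate-pair arithmetic is only taken on narrow, UCS-2 builds):

    v = 0x1F600 - 0x10000          # U+1F600, as an offset into the astral planes
    high = 0xD800 | (v >> 10)      # top 10 bits -> high surrogate
    low = 0xDC00 | (v & 0x3FF)     # bottom 10 bits -> low surrogate
    assert (high, low) == (0xD83D, 0xDE00)
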
137 def consumeEntity(self, allowedChar=None, fromAttribute=False):
138 # Initialise to the default output for when no entity is matched
139 output = "&"
140
141 charStack = [self.stream.char()]
142 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
143 (allowedChar is not None and allowedChar == charStack[0])):
144 self.stream.unget(charStack[0])
145
146 elif charStack[0] == "#":
147 # Read the next character to see if it's hex or decimal
148 hex = False
149 charStack.append(self.stream.char())
150 if charStack[-1] in ("x", "X"):
151 hex = True
152 charStack.append(self.stream.char())
153
154 # charStack[-1] should be the first digit
155 if (hex and charStack[-1] in hexDigits) \
156 or (not hex and charStack[-1] in digits):
157 # At least one digit found, so consume the whole number
158 self.stream.unget(charStack[-1])
159 output = self.consumeNumberEntity(hex)
160 else:
161 # No digits found
162 self.tokenQueue.append({"type": tokenTypes["ParseError"],
163 "data": "expected-numeric-entity"})
164 self.stream.unget(charStack.pop())
165 output = "&" + "".join(charStack)
166
167 else:
168            # At this point in the process we might have a named entity.
169            # Entities are stored in the global variable "entities".
170            #
171            # Consume characters and compare them to a substring of the
172 # entity names in the list until the substring no longer matches.
173 while (charStack[-1] is not EOF):
174 if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
175 break
176 charStack.append(self.stream.char())
177
178 # At this point we have a string that starts with some characters
179 # that may match an entity
180 # Try to find the longest entity the string will match to take care
181 # of &noti for instance.
182 try:
183 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
184 entityLength = len(entityName)
185 except KeyError:
186 entityName = None
187
188 if entityName is not None:
189 if entityName[-1] != ";":
190 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
191 "named-entity-without-semicolon"})
192 if (entityName[-1] != ";" and fromAttribute and
193 (charStack[entityLength] in asciiLetters or
194 charStack[entityLength] in digits or
195 charStack[entityLength] == "=")):
196 self.stream.unget(charStack.pop())
197 output = "&" + "".join(charStack)
198 else:
199 output = entities[entityName]
200 self.stream.unget(charStack.pop())
201 output += "".join(charStack[entityLength:])
202 else:
203 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
204 "expected-named-entity"})
205 self.stream.unget(charStack.pop())
206 output = "&" + "".join(charStack)
207
208 if fromAttribute:
209 self.currentToken["data"][-1][1] += output
210 else:
211 if output in spaceCharacters:
212 tokenType = "SpaceCharacters"
213 else:
214 tokenType = "Characters"
215 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
216
217 def processEntityInAttribute(self, allowedChar):
218 """This method replaces the need for "entityInAttributeValueState".
219 """
220 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
221
222 def emitCurrentToken(self):
223 """This method is a generic handler for emitting the tags. It also sets
224 the state to "data" because that's what's needed after a token has been
225 emitted.
226 """
227 token = self.currentToken
228 # Add token to the queue to be yielded
229 if (token["type"] in tagTokenTypes):
230 token["name"] = token["name"].translate(asciiUpper2Lower)
231 if token["type"] == tokenTypes["EndTag"]:
232 if token["data"]:
233 self.tokenQueue.append({"type": tokenTypes["ParseError"],
234 "data": "attributes-in-end-tag"})
235 if token["selfClosing"]:
236 self.tokenQueue.append({"type": tokenTypes["ParseError"],
237 "data": "self-closing-flag-on-end-tag"})
238 self.tokenQueue.append(token)
239 self.state = self.dataState
240
241 # Below are the various tokenizer states, one method per state.
242 def dataState(self):
243 data = self.stream.char()
244 if data == "&":
245 self.state = self.entityDataState
246 elif data == "<":
247 self.state = self.tagOpenState
248 elif data == "\u0000":
249 self.tokenQueue.append({"type": tokenTypes["ParseError"],
250 "data": "invalid-codepoint"})
251 self.tokenQueue.append({"type": tokenTypes["Characters"],
252 "data": "\u0000"})
253 elif data is EOF:
254 # Tokenization ends.
255 return False
256 elif data in spaceCharacters:
257 # Directly after emitting a token you switch back to the "data
258 # state". At that point spaceCharacters are important so they are
259 # emitted separately.
260 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
261 data + self.stream.charsUntil(spaceCharacters, True)})
262 # No need to update lastFourChars here, since the first space will
263 # have already been appended to lastFourChars and will have broken
264 # any <!-- or --> sequences
265 else:
266 chars = self.stream.charsUntil(("&", "<", "\u0000"))
267 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
268 data + chars})
269 return True
270
271 def entityDataState(self):
272 self.consumeEntity()
273 self.state = self.dataState
274 return True
275
276 def rcdataState(self):
277 data = self.stream.char()
278 if data == "&":
279 self.state = self.characterReferenceInRcdata
280 elif data == "<":
281 self.state = self.rcdataLessThanSignState
282 elif data == EOF:
283 # Tokenization ends.
284 return False
285 elif data == "\u0000":
286 self.tokenQueue.append({"type": tokenTypes["ParseError"],
287 "data": "invalid-codepoint"})
288 self.tokenQueue.append({"type": tokenTypes["Characters"],
289 "data": "\uFFFD"})
290 elif data in spaceCharacters:
291 # Directly after emitting a token you switch back to the "data
292 # state". At that point spaceCharacters are important so they are
293 # emitted separately.
294 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
295 data + self.stream.charsUntil(spaceCharacters, True)})
296 # No need to update lastFourChars here, since the first space will
297 # have already been appended to lastFourChars and will have broken
298 # any <!-- or --> sequences
299 else:
300 chars = self.stream.charsUntil(("&", "<", "\u0000"))
301 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
302 data + chars})
303 return True
304
305 def characterReferenceInRcdata(self):
306 self.consumeEntity()
307 self.state = self.rcdataState
308 return True
309
310 def rawtextState(self):
311 data = self.stream.char()
312 if data == "<":
313 self.state = self.rawtextLessThanSignState
314 elif data == "\u0000":
315 self.tokenQueue.append({"type": tokenTypes["ParseError"],
316 "data": "invalid-codepoint"})
317 self.tokenQueue.append({"type": tokenTypes["Characters"],
318 "data": "\uFFFD"})
319 elif data == EOF:
320 # Tokenization ends.
321 return False
322 else:
323 chars = self.stream.charsUntil(("<", "\u0000"))
324 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
325 data + chars})
326 return True
327
328 def scriptDataState(self):
329 data = self.stream.char()
330 if data == "<":
331 self.state = self.scriptDataLessThanSignState
332 elif data == "\u0000":
333 self.tokenQueue.append({"type": tokenTypes["ParseError"],
334 "data": "invalid-codepoint"})
335 self.tokenQueue.append({"type": tokenTypes["Characters"],
336 "data": "\uFFFD"})
337 elif data == EOF:
338 # Tokenization ends.
339 return False
340 else:
341 chars = self.stream.charsUntil(("<", "\u0000"))
342 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
343 data + chars})
344 return True
345
346 def plaintextState(self):
347 data = self.stream.char()
348 if data == EOF:
349 # Tokenization ends.
350 return False
351 elif data == "\u0000":
352 self.tokenQueue.append({"type": tokenTypes["ParseError"],
353 "data": "invalid-codepoint"})
354 self.tokenQueue.append({"type": tokenTypes["Characters"],
355 "data": "\uFFFD"})
356 else:
357 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
358 data + self.stream.charsUntil("\u0000")})
359 return True
360
361 def tagOpenState(self):
362 data = self.stream.char()
363 if data == "!":
364 self.state = self.markupDeclarationOpenState
365 elif data == "/":
366 self.state = self.closeTagOpenState
367 elif data in asciiLetters:
368 self.currentToken = {"type": tokenTypes["StartTag"],
369 "name": data, "data": [],
370 "selfClosing": False,
371 "selfClosingAcknowledged": False}
372 self.state = self.tagNameState
373 elif data == ">":
374 # XXX In theory it could be something besides a tag name. But
375 # do we really care?
376 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
377 "expected-tag-name-but-got-right-bracket"})
378 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
379 self.state = self.dataState
380 elif data == "?":
381 # XXX In theory it could be something besides a tag name. But
382 # do we really care?
383 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
384 "expected-tag-name-but-got-question-mark"})
385 self.stream.unget(data)
386 self.state = self.bogusCommentState
387 else:
388 # XXX
389 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
390 "expected-tag-name"})
391 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
392 self.stream.unget(data)
393 self.state = self.dataState
394 return True
395
396 def closeTagOpenState(self):
397 data = self.stream.char()
398 if data in asciiLetters:
399 self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
400 "data": [], "selfClosing": False}
401 self.state = self.tagNameState
402 elif data == ">":
403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404 "expected-closing-tag-but-got-right-bracket"})
405 self.state = self.dataState
406 elif data is EOF:
407 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
408 "expected-closing-tag-but-got-eof"})
409 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
410 self.state = self.dataState
411 else:
412 # XXX data can be _'_...
413 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
414 "expected-closing-tag-but-got-char",
415 "datavars": {"data": data}})
416 self.stream.unget(data)
417 self.state = self.bogusCommentState
418 return True
419
420 def tagNameState(self):
421 data = self.stream.char()
422 if data in spaceCharacters:
423 self.state = self.beforeAttributeNameState
424 elif data == ">":
425 self.emitCurrentToken()
426 elif data is EOF:
427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428 "eof-in-tag-name"})
429 self.state = self.dataState
430 elif data == "/":
431 self.state = self.selfClosingStartTagState
432 elif data == "\u0000":
433 self.tokenQueue.append({"type": tokenTypes["ParseError"],
434 "data": "invalid-codepoint"})
435 self.currentToken["name"] += "\uFFFD"
436 else:
437 self.currentToken["name"] += data
438 # (Don't use charsUntil here, because tag names are
439 # very short and it's faster to not do anything fancy)
440 return True
441
442 def rcdataLessThanSignState(self):
443 data = self.stream.char()
444 if data == "/":
445 self.temporaryBuffer = ""
446 self.state = self.rcdataEndTagOpenState
447 else:
448 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
449 self.stream.unget(data)
450 self.state = self.rcdataState
451 return True
452
453 def rcdataEndTagOpenState(self):
454 data = self.stream.char()
455 if data in asciiLetters:
456 self.temporaryBuffer += data
457 self.state = self.rcdataEndTagNameState
458 else:
459 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
460 self.stream.unget(data)
461 self.state = self.rcdataState
462 return True
463
464 def rcdataEndTagNameState(self):
465 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
466 data = self.stream.char()
467 if data in spaceCharacters and appropriate:
468 self.currentToken = {"type": tokenTypes["EndTag"],
469 "name": self.temporaryBuffer,
470 "data": [], "selfClosing": False}
471 self.state = self.beforeAttributeNameState
472 elif data == "/" and appropriate:
473 self.currentToken = {"type": tokenTypes["EndTag"],
474 "name": self.temporaryBuffer,
475 "data": [], "selfClosing": False}
476 self.state = self.selfClosingStartTagState
477 elif data == ">" and appropriate:
478 self.currentToken = {"type": tokenTypes["EndTag"],
479 "name": self.temporaryBuffer,
480 "data": [], "selfClosing": False}
481 self.emitCurrentToken()
482 self.state = self.dataState
483 elif data in asciiLetters:
484 self.temporaryBuffer += data
485 else:
486 self.tokenQueue.append({"type": tokenTypes["Characters"],
487 "data": "</" + self.temporaryBuffer})
488 self.stream.unget(data)
489 self.state = self.rcdataState
490 return True
491
492 def rawtextLessThanSignState(self):
493 data = self.stream.char()
494 if data == "/":
495 self.temporaryBuffer = ""
496 self.state = self.rawtextEndTagOpenState
497 else:
498 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
499 self.stream.unget(data)
500 self.state = self.rawtextState
501 return True
502
503 def rawtextEndTagOpenState(self):
504 data = self.stream.char()
505 if data in asciiLetters:
506 self.temporaryBuffer += data
507 self.state = self.rawtextEndTagNameState
508 else:
509 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
510 self.stream.unget(data)
511 self.state = self.rawtextState
512 return True
513
514 def rawtextEndTagNameState(self):
515 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
516 data = self.stream.char()
517 if data in spaceCharacters and appropriate:
518 self.currentToken = {"type": tokenTypes["EndTag"],
519 "name": self.temporaryBuffer,
520 "data": [], "selfClosing": False}
521 self.state = self.beforeAttributeNameState
522 elif data == "/" and appropriate:
523 self.currentToken = {"type": tokenTypes["EndTag"],
524 "name": self.temporaryBuffer,
525 "data": [], "selfClosing": False}
526 self.state = self.selfClosingStartTagState
527 elif data == ">" and appropriate:
528 self.currentToken = {"type": tokenTypes["EndTag"],
529 "name": self.temporaryBuffer,
530 "data": [], "selfClosing": False}
531 self.emitCurrentToken()
532 self.state = self.dataState
533 elif data in asciiLetters:
534 self.temporaryBuffer += data
535 else:
536 self.tokenQueue.append({"type": tokenTypes["Characters"],
537 "data": "</" + self.temporaryBuffer})
538 self.stream.unget(data)
539 self.state = self.rawtextState
540 return True
541
542 def scriptDataLessThanSignState(self):
543 data = self.stream.char()
544 if data == "/":
545 self.temporaryBuffer = ""
546 self.state = self.scriptDataEndTagOpenState
547 elif data == "!":
548 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
549 self.state = self.scriptDataEscapeStartState
550 else:
551 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
552 self.stream.unget(data)
553 self.state = self.scriptDataState
554 return True
555
556 def scriptDataEndTagOpenState(self):
557 data = self.stream.char()
558 if data in asciiLetters:
559 self.temporaryBuffer += data
560 self.state = self.scriptDataEndTagNameState
561 else:
562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
563 self.stream.unget(data)
564 self.state = self.scriptDataState
565 return True
566
567 def scriptDataEndTagNameState(self):
568 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
569 data = self.stream.char()
570 if data in spaceCharacters and appropriate:
571 self.currentToken = {"type": tokenTypes["EndTag"],
572 "name": self.temporaryBuffer,
573 "data": [], "selfClosing": False}
574 self.state = self.beforeAttributeNameState
575 elif data == "/" and appropriate:
576 self.currentToken = {"type": tokenTypes["EndTag"],
577 "name": self.temporaryBuffer,
578 "data": [], "selfClosing": False}
579 self.state = self.selfClosingStartTagState
580 elif data == ">" and appropriate:
581 self.currentToken = {"type": tokenTypes["EndTag"],
582 "name": self.temporaryBuffer,
583 "data": [], "selfClosing": False}
584 self.emitCurrentToken()
585 self.state = self.dataState
586 elif data in asciiLetters:
587 self.temporaryBuffer += data
588 else:
589 self.tokenQueue.append({"type": tokenTypes["Characters"],
590 "data": "</" + self.temporaryBuffer})
591 self.stream.unget(data)
592 self.state = self.scriptDataState
593 return True
594
595 def scriptDataEscapeStartState(self):
596 data = self.stream.char()
597 if data == "-":
598 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
599 self.state = self.scriptDataEscapeStartDashState
600 else:
601 self.stream.unget(data)
602 self.state = self.scriptDataState
603 return True
604
605 def scriptDataEscapeStartDashState(self):
606 data = self.stream.char()
607 if data == "-":
608 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
609 self.state = self.scriptDataEscapedDashDashState
610 else:
611 self.stream.unget(data)
612 self.state = self.scriptDataState
613 return True
614
615 def scriptDataEscapedState(self):
616 data = self.stream.char()
617 if data == "-":
618 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
619 self.state = self.scriptDataEscapedDashState
620 elif data == "<":
621 self.state = self.scriptDataEscapedLessThanSignState
622 elif data == "\u0000":
623 self.tokenQueue.append({"type": tokenTypes["ParseError"],
624 "data": "invalid-codepoint"})
625 self.tokenQueue.append({"type": tokenTypes["Characters"],
626 "data": "\uFFFD"})
627 elif data == EOF:
628 self.state = self.dataState
629 else:
630 chars = self.stream.charsUntil(("<", "-", "\u0000"))
631 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
632 data + chars})
633 return True
634
635 def scriptDataEscapedDashState(self):
636 data = self.stream.char()
637 if data == "-":
638 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
639 self.state = self.scriptDataEscapedDashDashState
640 elif data == "<":
641 self.state = self.scriptDataEscapedLessThanSignState
642 elif data == "\u0000":
643 self.tokenQueue.append({"type": tokenTypes["ParseError"],
644 "data": "invalid-codepoint"})
645 self.tokenQueue.append({"type": tokenTypes["Characters"],
646 "data": "\uFFFD"})
647 self.state = self.scriptDataEscapedState
648 elif data == EOF:
649 self.state = self.dataState
650 else:
651 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
652 self.state = self.scriptDataEscapedState
653 return True
654
655 def scriptDataEscapedDashDashState(self):
656 data = self.stream.char()
657 if data == "-":
658 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
659 elif data == "<":
660 self.state = self.scriptDataEscapedLessThanSignState
661 elif data == ">":
662 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
663 self.state = self.scriptDataState
664 elif data == "\u0000":
665 self.tokenQueue.append({"type": tokenTypes["ParseError"],
666 "data": "invalid-codepoint"})
667 self.tokenQueue.append({"type": tokenTypes["Characters"],
668 "data": "\uFFFD"})
669 self.state = self.scriptDataEscapedState
670 elif data == EOF:
671 self.state = self.dataState
672 else:
673 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
674 self.state = self.scriptDataEscapedState
675 return True
676
677 def scriptDataEscapedLessThanSignState(self):
678 data = self.stream.char()
679 if data == "/":
680 self.temporaryBuffer = ""
681 self.state = self.scriptDataEscapedEndTagOpenState
682 elif data in asciiLetters:
683 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
684 self.temporaryBuffer = data
685 self.state = self.scriptDataDoubleEscapeStartState
686 else:
687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
688 self.stream.unget(data)
689 self.state = self.scriptDataEscapedState
690 return True
691
692 def scriptDataEscapedEndTagOpenState(self):
693 data = self.stream.char()
694 if data in asciiLetters:
695 self.temporaryBuffer = data
696 self.state = self.scriptDataEscapedEndTagNameState
697 else:
698 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
699 self.stream.unget(data)
700 self.state = self.scriptDataEscapedState
701 return True
702
703 def scriptDataEscapedEndTagNameState(self):
704 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
705 data = self.stream.char()
706 if data in spaceCharacters and appropriate:
707 self.currentToken = {"type": tokenTypes["EndTag"],
708 "name": self.temporaryBuffer,
709 "data": [], "selfClosing": False}
710 self.state = self.beforeAttributeNameState
711 elif data == "/" and appropriate:
712 self.currentToken = {"type": tokenTypes["EndTag"],
713 "name": self.temporaryBuffer,
714 "data": [], "selfClosing": False}
715 self.state = self.selfClosingStartTagState
716 elif data == ">" and appropriate:
717 self.currentToken = {"type": tokenTypes["EndTag"],
718 "name": self.temporaryBuffer,
719 "data": [], "selfClosing": False}
720 self.emitCurrentToken()
721 self.state = self.dataState
722 elif data in asciiLetters:
723 self.temporaryBuffer += data
724 else:
725 self.tokenQueue.append({"type": tokenTypes["Characters"],
726 "data": "</" + self.temporaryBuffer})
727 self.stream.unget(data)
728 self.state = self.scriptDataEscapedState
729 return True
730
731 def scriptDataDoubleEscapeStartState(self):
732 data = self.stream.char()
733 if data in (spaceCharacters | frozenset(("/", ">"))):
734 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
735 if self.temporaryBuffer.lower() == "script":
736 self.state = self.scriptDataDoubleEscapedState
737 else:
738 self.state = self.scriptDataEscapedState
739 elif data in asciiLetters:
740 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
741 self.temporaryBuffer += data
742 else:
743 self.stream.unget(data)
744 self.state = self.scriptDataEscapedState
745 return True
746
747 def scriptDataDoubleEscapedState(self):
748 data = self.stream.char()
749 if data == "-":
750 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
751 self.state = self.scriptDataDoubleEscapedDashState
752 elif data == "<":
753 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
754 self.state = self.scriptDataDoubleEscapedLessThanSignState
755 elif data == "\u0000":
756 self.tokenQueue.append({"type": tokenTypes["ParseError"],
757 "data": "invalid-codepoint"})
758 self.tokenQueue.append({"type": tokenTypes["Characters"],
759 "data": "\uFFFD"})
760 elif data == EOF:
761 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
762 "eof-in-script-in-script"})
763 self.state = self.dataState
764 else:
765 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
766 return True
767
768 def scriptDataDoubleEscapedDashState(self):
769 data = self.stream.char()
770 if data == "-":
771 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
772 self.state = self.scriptDataDoubleEscapedDashDashState
773 elif data == "<":
774 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
775 self.state = self.scriptDataDoubleEscapedLessThanSignState
776 elif data == "\u0000":
777 self.tokenQueue.append({"type": tokenTypes["ParseError"],
778 "data": "invalid-codepoint"})
779 self.tokenQueue.append({"type": tokenTypes["Characters"],
780 "data": "\uFFFD"})
781 self.state = self.scriptDataDoubleEscapedState
782 elif data == EOF:
783 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
784 "eof-in-script-in-script"})
785 self.state = self.dataState
786 else:
787 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
788 self.state = self.scriptDataDoubleEscapedState
789 return True
790
791 def scriptDataDoubleEscapedDashDashState(self):
792 data = self.stream.char()
793 if data == "-":
794 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
795 elif data == "<":
796 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
797 self.state = self.scriptDataDoubleEscapedLessThanSignState
798 elif data == ">":
799 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
800 self.state = self.scriptDataState
801 elif data == "\u0000":
802 self.tokenQueue.append({"type": tokenTypes["ParseError"],
803 "data": "invalid-codepoint"})
804 self.tokenQueue.append({"type": tokenTypes["Characters"],
805 "data": "\uFFFD"})
806 self.state = self.scriptDataDoubleEscapedState
807 elif data == EOF:
808 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
809 "eof-in-script-in-script"})
810 self.state = self.dataState
811 else:
812 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
813 self.state = self.scriptDataDoubleEscapedState
814 return True
815
816 def scriptDataDoubleEscapedLessThanSignState(self):
817 data = self.stream.char()
818 if data == "/":
819 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
820 self.temporaryBuffer = ""
821 self.state = self.scriptDataDoubleEscapeEndState
822 else:
823 self.stream.unget(data)
824 self.state = self.scriptDataDoubleEscapedState
825 return True
826
827 def scriptDataDoubleEscapeEndState(self):
828 data = self.stream.char()
829 if data in (spaceCharacters | frozenset(("/", ">"))):
830 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
831 if self.temporaryBuffer.lower() == "script":
832 self.state = self.scriptDataEscapedState
833 else:
834 self.state = self.scriptDataDoubleEscapedState
835 elif data in asciiLetters:
836 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
837 self.temporaryBuffer += data
838 else:
839 self.stream.unget(data)
840 self.state = self.scriptDataDoubleEscapedState
841 return True
842
843 def beforeAttributeNameState(self):
844 data = self.stream.char()
845 if data in spaceCharacters:
846 self.stream.charsUntil(spaceCharacters, True)
847 elif data in asciiLetters:
848 self.currentToken["data"].append([data, ""])
849 self.state = self.attributeNameState
850 elif data == ">":
851 self.emitCurrentToken()
852 elif data == "/":
853 self.state = self.selfClosingStartTagState
854 elif data in ("'", '"', "=", "<"):
855 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
856 "invalid-character-in-attribute-name"})
857 self.currentToken["data"].append([data, ""])
858 self.state = self.attributeNameState
859 elif data == "\u0000":
860 self.tokenQueue.append({"type": tokenTypes["ParseError"],
861 "data": "invalid-codepoint"})
862 self.currentToken["data"].append(["\uFFFD", ""])
863 self.state = self.attributeNameState
864 elif data is EOF:
865 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
866 "expected-attribute-name-but-got-eof"})
867 self.state = self.dataState
868 else:
869 self.currentToken["data"].append([data, ""])
870 self.state = self.attributeNameState
871 return True
872
873 def attributeNameState(self):
874 data = self.stream.char()
875 leavingThisState = True
876 emitToken = False
877 if data == "=":
878 self.state = self.beforeAttributeValueState
879 elif data in asciiLetters:
880 self.currentToken["data"][-1][0] += data +\
881 self.stream.charsUntil(asciiLetters, True)
882 leavingThisState = False
883 elif data == ">":
884 # XXX If we emit here, the attributes are converted to a dict
885 # without being checked, and when the code below runs we error
886 # because data is a dict, not a list
887 emitToken = True
888 elif data in spaceCharacters:
889 self.state = self.afterAttributeNameState
890 elif data == "/":
891 self.state = self.selfClosingStartTagState
892 elif data == "\u0000":
893 self.tokenQueue.append({"type": tokenTypes["ParseError"],
894 "data": "invalid-codepoint"})
895 self.currentToken["data"][-1][0] += "\uFFFD"
896 leavingThisState = False
897 elif data in ("'", '"', "<"):
898 self.tokenQueue.append({"type": tokenTypes["ParseError"],
899 "data":
900 "invalid-character-in-attribute-name"})
901 self.currentToken["data"][-1][0] += data
902 leavingThisState = False
903 elif data is EOF:
904 self.tokenQueue.append({"type": tokenTypes["ParseError"],
905 "data": "eof-in-attribute-name"})
906 self.state = self.dataState
907 else:
908 self.currentToken["data"][-1][0] += data
909 leavingThisState = False
910
911 if leavingThisState:
912 # Attributes are not dropped at this stage. That happens when the
913 # start tag token is emitted so values can still be safely appended
914 # to attributes, but we do want to report the parse error in time.
915 self.currentToken["data"][-1][0] = (
916 self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
917 for name, _ in self.currentToken["data"][:-1]:
918 if self.currentToken["data"][-1][0] == name:
919 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
920 "duplicate-attribute"})
921 break
922 # XXX Fix for above XXX
923 if emitToken:
924 self.emitCurrentToken()
925 return True
926
927 def afterAttributeNameState(self):
928 data = self.stream.char()
929 if data in spaceCharacters:
930 self.stream.charsUntil(spaceCharacters, True)
931 elif data == "=":
932 self.state = self.beforeAttributeValueState
933 elif data == ">":
934 self.emitCurrentToken()
935 elif data in asciiLetters:
936 self.currentToken["data"].append([data, ""])
937 self.state = self.attributeNameState
938 elif data == "/":
939 self.state = self.selfClosingStartTagState
940 elif data == "\u0000":
941 self.tokenQueue.append({"type": tokenTypes["ParseError"],
942 "data": "invalid-codepoint"})
943 self.currentToken["data"].append(["\uFFFD", ""])
944 self.state = self.attributeNameState
945 elif data in ("'", '"', "<"):
946 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
947 "invalid-character-after-attribute-name"})
948 self.currentToken["data"].append([data, ""])
949 self.state = self.attributeNameState
950 elif data is EOF:
951 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
952 "expected-end-of-tag-but-got-eof"})
953 self.state = self.dataState
954 else:
955 self.currentToken["data"].append([data, ""])
956 self.state = self.attributeNameState
957 return True
958
959 def beforeAttributeValueState(self):
960 data = self.stream.char()
961 if data in spaceCharacters:
962 self.stream.charsUntil(spaceCharacters, True)
963 elif data == "\"":
964 self.state = self.attributeValueDoubleQuotedState
965 elif data == "&":
966 self.state = self.attributeValueUnQuotedState
967 self.stream.unget(data)
968 elif data == "'":
969 self.state = self.attributeValueSingleQuotedState
970 elif data == ">":
971 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
972 "expected-attribute-value-but-got-right-bracket"})
973 self.emitCurrentToken()
974 elif data == "\u0000":
975 self.tokenQueue.append({"type": tokenTypes["ParseError"],
976 "data": "invalid-codepoint"})
977 self.currentToken["data"][-1][1] += "\uFFFD"
978 self.state = self.attributeValueUnQuotedState
979 elif data in ("=", "<", "`"):
980 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
981 "equals-in-unquoted-attribute-value"})
982 self.currentToken["data"][-1][1] += data
983 self.state = self.attributeValueUnQuotedState
984 elif data is EOF:
985 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986 "expected-attribute-value-but-got-eof"})
987 self.state = self.dataState
988 else:
989 self.currentToken["data"][-1][1] += data
990 self.state = self.attributeValueUnQuotedState
991 return True
992
993 def attributeValueDoubleQuotedState(self):
994 data = self.stream.char()
995 if data == "\"":
996 self.state = self.afterAttributeValueState
997 elif data == "&":
998 self.processEntityInAttribute('"')
999 elif data == "\u0000":
1000 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1001 "data": "invalid-codepoint"})
1002 self.currentToken["data"][-1][1] += "\uFFFD"
1003 elif data is EOF:
1004 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1005 "eof-in-attribute-value-double-quote"})
1006 self.state = self.dataState
1007 else:
1008 self.currentToken["data"][-1][1] += data +\
1009 self.stream.charsUntil(("\"", "&", "\u0000"))
1010 return True
1011
1012 def attributeValueSingleQuotedState(self):
1013 data = self.stream.char()
1014 if data == "'":
1015 self.state = self.afterAttributeValueState
1016 elif data == "&":
1017 self.processEntityInAttribute("'")
1018 elif data == "\u0000":
1019 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1020 "data": "invalid-codepoint"})
1021 self.currentToken["data"][-1][1] += "\uFFFD"
1022 elif data is EOF:
1023 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1024 "eof-in-attribute-value-single-quote"})
1025 self.state = self.dataState
1026 else:
1027 self.currentToken["data"][-1][1] += data +\
1028 self.stream.charsUntil(("'", "&", "\u0000"))
1029 return True
1030
1031 def attributeValueUnQuotedState(self):
1032 data = self.stream.char()
1033 if data in spaceCharacters:
1034 self.state = self.beforeAttributeNameState
1035 elif data == "&":
1036 self.processEntityInAttribute(">")
1037 elif data == ">":
1038 self.emitCurrentToken()
1039 elif data in ('"', "'", "=", "<", "`"):
1040 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1041 "unexpected-character-in-unquoted-attribute-value"})
1042 self.currentToken["data"][-1][1] += data
1043 elif data == "\u0000":
1044 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1045 "data": "invalid-codepoint"})
1046 self.currentToken["data"][-1][1] += "\uFFFD"
1047 elif data is EOF:
1048 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1049 "eof-in-attribute-value-no-quotes"})
1050 self.state = self.dataState
1051 else:
1052 self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1053 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1054 return True
1055
1056 def afterAttributeValueState(self):
1057 data = self.stream.char()
1058 if data in spaceCharacters:
1059 self.state = self.beforeAttributeNameState
1060 elif data == ">":
1061 self.emitCurrentToken()
1062 elif data == "/":
1063 self.state = self.selfClosingStartTagState
1064 elif data is EOF:
1065 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1066 "unexpected-EOF-after-attribute-value"})
1067 self.stream.unget(data)
1068 self.state = self.dataState
1069 else:
1070 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1071 "unexpected-character-after-attribute-value"})
1072 self.stream.unget(data)
1073 self.state = self.beforeAttributeNameState
1074 return True
1075
1076 def selfClosingStartTagState(self):
1077 data = self.stream.char()
1078 if data == ">":
1079 self.currentToken["selfClosing"] = True
1080 self.emitCurrentToken()
1081 elif data is EOF:
1082 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1083 "data":
1084 "unexpected-EOF-after-solidus-in-tag"})
1085 self.stream.unget(data)
1086 self.state = self.dataState
1087 else:
1088 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1089 "unexpected-character-after-solidus-in-tag"})
1090 self.stream.unget(data)
1091 self.state = self.beforeAttributeNameState
1092 return True
1093
1094 def bogusCommentState(self):
1095 # Make a new comment token and give it as value all the characters
1096 # until the first > or EOF (charsUntil checks for EOF automatically)
1097 # and emit it.
1098 data = self.stream.charsUntil(">")
1099 data = data.replace("\u0000", "\uFFFD")
1100 self.tokenQueue.append(
1101 {"type": tokenTypes["Comment"], "data": data})
1102
1103 # Eat the character directly after the bogus comment which is either a
1104 # ">" or an EOF.
1105 self.stream.char()
1106 self.state = self.dataState
1107 return True
1108
1109 def markupDeclarationOpenState(self):
1110 charStack = [self.stream.char()]
1111 if charStack[-1] == "-":
1112 charStack.append(self.stream.char())
1113 if charStack[-1] == "-":
1114 self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1115 self.state = self.commentStartState
1116 return True
1117 elif charStack[-1] in ('d', 'D'):
1118 matched = True
1119 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1120 ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1121 charStack.append(self.stream.char())
1122 if charStack[-1] not in expected:
1123 matched = False
1124 break
1125 if matched:
1126 self.currentToken = {"type": tokenTypes["Doctype"],
1127 "name": "",
1128 "publicId": None, "systemId": None,
1129 "correct": True}
1130 self.state = self.doctypeState
1131 return True
1132 elif (charStack[-1] == "[" and
1133 self.parser is not None and
1134 self.parser.tree.openElements and
1135 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1136 matched = True
1137 for expected in ["C", "D", "A", "T", "A", "["]:
1138 charStack.append(self.stream.char())
1139 if charStack[-1] != expected:
1140 matched = False
1141 break
1142 if matched:
1143 self.state = self.cdataSectionState
1144 return True
1145
1146 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1147 "expected-dashes-or-doctype"})
1148
1149 while charStack:
1150 self.stream.unget(charStack.pop())
1151 self.state = self.bogusCommentState
1152 return True
1153
1154 def commentStartState(self):
1155 data = self.stream.char()
1156 if data == "-":
1157 self.state = self.commentStartDashState
1158 elif data == "\u0000":
1159 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1160 "data": "invalid-codepoint"})
1161 self.currentToken["data"] += "\uFFFD"
1162 elif data == ">":
1163 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1164 "incorrect-comment"})
1165 self.tokenQueue.append(self.currentToken)
1166 self.state = self.dataState
1167 elif data is EOF:
1168 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1169 "eof-in-comment"})
1170 self.tokenQueue.append(self.currentToken)
1171 self.state = self.dataState
1172 else:
1173 self.currentToken["data"] += data
1174 self.state = self.commentState
1175 return True
1176
1177 def commentStartDashState(self):
1178 data = self.stream.char()
1179 if data == "-":
1180 self.state = self.commentEndState
1181 elif data == "\u0000":
1182 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1183 "data": "invalid-codepoint"})
1184 self.currentToken["data"] += "-\uFFFD"
1185 elif data == ">":
1186 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1187 "incorrect-comment"})
1188 self.tokenQueue.append(self.currentToken)
1189 self.state = self.dataState
1190 elif data is EOF:
1191 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1192 "eof-in-comment"})
1193 self.tokenQueue.append(self.currentToken)
1194 self.state = self.dataState
1195 else:
1196 self.currentToken["data"] += "-" + data
1197 self.state = self.commentState
1198 return True
1199
1200 def commentState(self):
1201 data = self.stream.char()
1202 if data == "-":
1203 self.state = self.commentEndDashState
1204 elif data == "\u0000":
1205 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1206 "data": "invalid-codepoint"})
1207 self.currentToken["data"] += "\uFFFD"
1208 elif data is EOF:
1209 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1210 "data": "eof-in-comment"})
1211 self.tokenQueue.append(self.currentToken)
1212 self.state = self.dataState
1213 else:
1214 self.currentToken["data"] += data + \
1215 self.stream.charsUntil(("-", "\u0000"))
1216 return True
1217
1218 def commentEndDashState(self):
1219 data = self.stream.char()
1220 if data == "-":
1221 self.state = self.commentEndState
1222 elif data == "\u0000":
1223 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224 "data": "invalid-codepoint"})
1225 self.currentToken["data"] += "-\uFFFD"
1226 self.state = self.commentState
1227 elif data is EOF:
1228 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1229 "eof-in-comment-end-dash"})
1230 self.tokenQueue.append(self.currentToken)
1231 self.state = self.dataState
1232 else:
1233 self.currentToken["data"] += "-" + data
1234 self.state = self.commentState
1235 return True
1236
1237 def commentEndState(self):
1238 data = self.stream.char()
1239 if data == ">":
1240 self.tokenQueue.append(self.currentToken)
1241 self.state = self.dataState
1242 elif data == "\u0000":
1243 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1244 "data": "invalid-codepoint"})
1245 self.currentToken["data"] += "--\uFFFD"
1246 self.state = self.commentState
1247 elif data == "!":
1248 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1249 "unexpected-bang-after-double-dash-in-comment"})
1250 self.state = self.commentEndBangState
1251 elif data == "-":
1252 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1253 "unexpected-dash-after-double-dash-in-comment"})
1254 self.currentToken["data"] += data
1255 elif data is EOF:
1256 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1257 "eof-in-comment-double-dash"})
1258 self.tokenQueue.append(self.currentToken)
1259 self.state = self.dataState
1260 else:
1261 # XXX
1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263 "unexpected-char-in-comment"})
1264 self.currentToken["data"] += "--" + data
1265 self.state = self.commentState
1266 return True
1267
1268 def commentEndBangState(self):
1269 data = self.stream.char()
1270 if data == ">":
1271 self.tokenQueue.append(self.currentToken)
1272 self.state = self.dataState
1273 elif data == "-":
1274 self.currentToken["data"] += "--!"
1275 self.state = self.commentEndDashState
1276 elif data == "\u0000":
1277 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1278 "data": "invalid-codepoint"})
1279 self.currentToken["data"] += "--!\uFFFD"
1280 self.state = self.commentState
1281 elif data is EOF:
1282 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1283 "eof-in-comment-end-bang-state"})
1284 self.tokenQueue.append(self.currentToken)
1285 self.state = self.dataState
1286 else:
1287 self.currentToken["data"] += "--!" + data
1288 self.state = self.commentState
1289 return True
1290
1291 def doctypeState(self):
1292 data = self.stream.char()
1293 if data in spaceCharacters:
1294 self.state = self.beforeDoctypeNameState
1295 elif data is EOF:
1296 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297 "expected-doctype-name-but-got-eof"})
1298 self.currentToken["correct"] = False
1299 self.tokenQueue.append(self.currentToken)
1300 self.state = self.dataState
1301 else:
1302 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1303 "need-space-after-doctype"})
1304 self.stream.unget(data)
1305 self.state = self.beforeDoctypeNameState
1306 return True
1307
1308 def beforeDoctypeNameState(self):
1309 data = self.stream.char()
1310 if data in spaceCharacters:
1311 pass
1312 elif data == ">":
1313 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1314 "expected-doctype-name-but-got-right-bracket"})
1315 self.currentToken["correct"] = False
1316 self.tokenQueue.append(self.currentToken)
1317 self.state = self.dataState
1318 elif data == "\u0000":
1319 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1320 "data": "invalid-codepoint"})
1321 self.currentToken["name"] = "\uFFFD"
1322 self.state = self.doctypeNameState
1323 elif data is EOF:
1324 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1325 "expected-doctype-name-but-got-eof"})
1326 self.currentToken["correct"] = False
1327 self.tokenQueue.append(self.currentToken)
1328 self.state = self.dataState
1329 else:
1330 self.currentToken["name"] = data
1331 self.state = self.doctypeNameState
1332 return True
1333
1334 def doctypeNameState(self):
1335 data = self.stream.char()
1336 if data in spaceCharacters:
1337 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1338 self.state = self.afterDoctypeNameState
1339 elif data == ">":
1340 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1341 self.tokenQueue.append(self.currentToken)
1342 self.state = self.dataState
1343 elif data == "\u0000":
1344 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1345 "data": "invalid-codepoint"})
1346 self.currentToken["name"] += "\uFFFD"
1347 self.state = self.doctypeNameState
1348 elif data is EOF:
1349 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1350 "eof-in-doctype-name"})
1351 self.currentToken["correct"] = False
1352 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1353 self.tokenQueue.append(self.currentToken)
1354 self.state = self.dataState
1355 else:
1356 self.currentToken["name"] += data
1357 return True
1358
1359 def afterDoctypeNameState(self):
1360 data = self.stream.char()
1361 if data in spaceCharacters:
1362 pass
1363 elif data == ">":
1364 self.tokenQueue.append(self.currentToken)
1365 self.state = self.dataState
1366 elif data is EOF:
1367 self.currentToken["correct"] = False
1368 self.stream.unget(data)
1369 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1370 "eof-in-doctype"})
1371 self.tokenQueue.append(self.currentToken)
1372 self.state = self.dataState
1373 else:
1374 if data in ("p", "P"):
1375 matched = True
1376 for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1377 ("i", "I"), ("c", "C")):
1378 data = self.stream.char()
1379 if data not in expected:
1380 matched = False
1381 break
1382 if matched:
1383 self.state = self.afterDoctypePublicKeywordState
1384 return True
1385 elif data in ("s", "S"):
1386 matched = True
1387 for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1388 ("e", "E"), ("m", "M")):
1389 data = self.stream.char()
1390 if data not in expected:
1391 matched = False
1392 break
1393 if matched:
1394 self.state = self.afterDoctypeSystemKeywordState
1395 return True
1396
1397 # All the characters read before the current 'data' will be
1398 # [a-zA-Z], so they're garbage in the bogus doctype and can be
1399 # discarded; only the latest character might be '>' or EOF
1400 # and needs to be put back on the stream
1401 self.stream.unget(data)
1402 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1403 "expected-space-or-right-bracket-in-doctype", "datavars":
1404 {"data": data}})
1405 self.currentToken["correct"] = False
1406 self.state = self.bogusDoctypeState
1407
1408 return True
1409
1410 def afterDoctypePublicKeywordState(self):
1411 data = self.stream.char()
1412 if data in spaceCharacters:
1413 self.state = self.beforeDoctypePublicIdentifierState
1414 elif data in ("'", '"'):
1415 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1416 "unexpected-char-in-doctype"})
1417 self.stream.unget(data)
1418 self.state = self.beforeDoctypePublicIdentifierState
1419 elif data is EOF:
1420 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1421 "eof-in-doctype"})
1422 self.currentToken["correct"] = False
1423 self.tokenQueue.append(self.currentToken)
1424 self.state = self.dataState
1425 else:
1426 self.stream.unget(data)
1427 self.state = self.beforeDoctypePublicIdentifierState
1428 return True
1429
1430 def beforeDoctypePublicIdentifierState(self):
1431 data = self.stream.char()
1432 if data in spaceCharacters:
1433 pass
1434 elif data == "\"":
1435 self.currentToken["publicId"] = ""
1436 self.state = self.doctypePublicIdentifierDoubleQuotedState
1437 elif data == "'":
1438 self.currentToken["publicId"] = ""
1439 self.state = self.doctypePublicIdentifierSingleQuotedState
1440 elif data == ">":
1441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1442 "unexpected-end-of-doctype"})
1443 self.currentToken["correct"] = False
1444 self.tokenQueue.append(self.currentToken)
1445 self.state = self.dataState
1446 elif data is EOF:
1447 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1448 "eof-in-doctype"})
1449 self.currentToken["correct"] = False
1450 self.tokenQueue.append(self.currentToken)
1451 self.state = self.dataState
1452 else:
1453 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1454 "unexpected-char-in-doctype"})
1455 self.currentToken["correct"] = False
1456 self.state = self.bogusDoctypeState
1457 return True
1458
1459 def doctypePublicIdentifierDoubleQuotedState(self):
1460 data = self.stream.char()
1461 if data == "\"":
1462 self.state = self.afterDoctypePublicIdentifierState
1463 elif data == "\u0000":
1464 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1465 "data": "invalid-codepoint"})
1466 self.currentToken["publicId"] += "\uFFFD"
1467 elif data == ">":
1468 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1469 "unexpected-end-of-doctype"})
1470 self.currentToken["correct"] = False
1471 self.tokenQueue.append(self.currentToken)
1472 self.state = self.dataState
1473 elif data is EOF:
1474 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1475 "eof-in-doctype"})
1476 self.currentToken["correct"] = False
1477 self.tokenQueue.append(self.currentToken)
1478 self.state = self.dataState
1479 else:
1480 self.currentToken["publicId"] += data
1481 return True
1482
1483 def doctypePublicIdentifierSingleQuotedState(self):
1484 data = self.stream.char()
1485 if data == "'":
1486 self.state = self.afterDoctypePublicIdentifierState
1487 elif data == "\u0000":
1488 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1489 "data": "invalid-codepoint"})
1490 self.currentToken["publicId"] += "\uFFFD"
1491 elif data == ">":
1492 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1493 "unexpected-end-of-doctype"})
1494 self.currentToken["correct"] = False
1495 self.tokenQueue.append(self.currentToken)
1496 self.state = self.dataState
1497 elif data is EOF:
1498 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1499 "eof-in-doctype"})
1500 self.currentToken["correct"] = False
1501 self.tokenQueue.append(self.currentToken)
1502 self.state = self.dataState
1503 else:
1504 self.currentToken["publicId"] += data
1505 return True
1506
1507 def afterDoctypePublicIdentifierState(self):
1508 data = self.stream.char()
1509 if data in spaceCharacters:
1510 self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1511 elif data == ">":
1512 self.tokenQueue.append(self.currentToken)
1513 self.state = self.dataState
1514 elif data == '"':
1515 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1516 "unexpected-char-in-doctype"})
1517 self.currentToken["systemId"] = ""
1518 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1519 elif data == "'":
1520 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1521 "unexpected-char-in-doctype"})
1522 self.currentToken["systemId"] = ""
1523 self.state = self.doctypeSystemIdentifierSingleQuotedState
1524 elif data is EOF:
1525 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526 "eof-in-doctype"})
1527 self.currentToken["correct"] = False
1528 self.tokenQueue.append(self.currentToken)
1529 self.state = self.dataState
1530 else:
1531 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1532 "unexpected-char-in-doctype"})
1533 self.currentToken["correct"] = False
1534 self.state = self.bogusDoctypeState
1535 return True
1536
1537 def betweenDoctypePublicAndSystemIdentifiersState(self):
1538 data = self.stream.char()
1539 if data in spaceCharacters:
1540 pass
1541 elif data == ">":
1542 self.tokenQueue.append(self.currentToken)
1543 self.state = self.dataState
1544 elif data == '"':
1545 self.currentToken["systemId"] = ""
1546 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1547 elif data == "'":
1548 self.currentToken["systemId"] = ""
1549 self.state = self.doctypeSystemIdentifierSingleQuotedState
1550 elif data == EOF:
1551 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1552 "eof-in-doctype"})
1553 self.currentToken["correct"] = False
1554 self.tokenQueue.append(self.currentToken)
1555 self.state = self.dataState
1556 else:
1557 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1558 "unexpected-char-in-doctype"})
1559 self.currentToken["correct"] = False
1560 self.state = self.bogusDoctypeState
1561 return True
1562
1563 def afterDoctypeSystemKeywordState(self):
1564 data = self.stream.char()
1565 if data in spaceCharacters:
1566 self.state = self.beforeDoctypeSystemIdentifierState
1567 elif data in ("'", '"'):
1568 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1569 "unexpected-char-in-doctype"})
1570 self.stream.unget(data)
1571 self.state = self.beforeDoctypeSystemIdentifierState
1572 elif data is EOF:
1573 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1574 "eof-in-doctype"})
1575 self.currentToken["correct"] = False
1576 self.tokenQueue.append(self.currentToken)
1577 self.state = self.dataState
1578 else:
1579 self.stream.unget(data)
1580 self.state = self.beforeDoctypeSystemIdentifierState
1581 return True
1582
1583 def beforeDoctypeSystemIdentifierState(self):
1584 data = self.stream.char()
1585 if data in spaceCharacters:
1586 pass
1587 elif data == "\"":
1588 self.currentToken["systemId"] = ""
1589 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1590 elif data == "'":
1591 self.currentToken["systemId"] = ""
1592 self.state = self.doctypeSystemIdentifierSingleQuotedState
1593 elif data == ">":
1594 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1595 "unexpected-char-in-doctype"})
1596 self.currentToken["correct"] = False
1597 self.tokenQueue.append(self.currentToken)
1598 self.state = self.dataState
1599 elif data is EOF:
1600 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1601 "eof-in-doctype"})
1602 self.currentToken["correct"] = False
1603 self.tokenQueue.append(self.currentToken)
1604 self.state = self.dataState
1605 else:
1606 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1607 "unexpected-char-in-doctype"})
1608 self.currentToken["correct"] = False
1609 self.state = self.bogusDoctypeState
1610 return True
1611
1612 def doctypeSystemIdentifierDoubleQuotedState(self):
1613 data = self.stream.char()
1614 if data == "\"":
1615 self.state = self.afterDoctypeSystemIdentifierState
1616 elif data == "\u0000":
1617 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1618 "data": "invalid-codepoint"})
1619 self.currentToken["systemId"] += "\uFFFD"
1620 elif data == ">":
1621 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1622 "unexpected-end-of-doctype"})
1623 self.currentToken["correct"] = False
1624 self.tokenQueue.append(self.currentToken)
1625 self.state = self.dataState
1626 elif data is EOF:
1627 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1628 "eof-in-doctype"})
1629 self.currentToken["correct"] = False
1630 self.tokenQueue.append(self.currentToken)
1631 self.state = self.dataState
1632 else:
1633 self.currentToken["systemId"] += data
1634 return True
1635
1636 def doctypeSystemIdentifierSingleQuotedState(self):
1637 data = self.stream.char()
1638 if data == "'":
1639 self.state = self.afterDoctypeSystemIdentifierState
1640 elif data == "\u0000":
1641 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1642 "data": "invalid-codepoint"})
1643 self.currentToken["systemId"] += "\uFFFD"
1644 elif data == ">":
1645 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1646 "unexpected-end-of-doctype"})
1647 self.currentToken["correct"] = False
1648 self.tokenQueue.append(self.currentToken)
1649 self.state = self.dataState
1650 elif data is EOF:
1651 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1652 "eof-in-doctype"})
1653 self.currentToken["correct"] = False
1654 self.tokenQueue.append(self.currentToken)
1655 self.state = self.dataState
1656 else:
1657 self.currentToken["systemId"] += data
1658 return True
1659
1660 def afterDoctypeSystemIdentifierState(self):
1661 data = self.stream.char()
1662 if data in spaceCharacters:
1663 pass
1664 elif data == ">":
1665 self.tokenQueue.append(self.currentToken)
1666 self.state = self.dataState
1667 elif data is EOF:
1668 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1669 "eof-in-doctype"})
1670 self.currentToken["correct"] = False
1671 self.tokenQueue.append(self.currentToken)
1672 self.state = self.dataState
1673 else:
1674 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1675 "unexpected-char-in-doctype"})
1676 self.state = self.bogusDoctypeState
1677 return True
1678
1679 def bogusDoctypeState(self):
1680 data = self.stream.char()
1681 if data == ">":
1682 self.tokenQueue.append(self.currentToken)
1683 self.state = self.dataState
1684 elif data is EOF:
1685 # XXX EMIT
1686 self.stream.unget(data)
1687 self.tokenQueue.append(self.currentToken)
1688 self.state = self.dataState
1689 else:
1690 pass
1691 return True
1692
1693 def cdataSectionState(self):
1694 data = []
1695 while True:
1696 data.append(self.stream.charsUntil("]"))
1697 data.append(self.stream.charsUntil(">"))
1698 char = self.stream.char()
1699 if char == EOF:
1700 break
1701 else:
1702 assert char == ">"
1703 if data[-1][-2:] == "]]":
1704 data[-1] = data[-1][:-2]
1705 break
1706 else:
1707 data.append(char)
1708
1709 data = "".join(data) # pylint:disable=redefined-variable-type
1710 # Deal with null here rather than in the parser
1711 nullCount = data.count("\u0000")
1712 if nullCount > 0:
1713 for _ in range(nullCount):
1714 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1715 "data": "invalid-codepoint"})
1716 data = data.replace("\u0000", "\uFFFD")
1717 if data:
1718 self.tokenQueue.append({"type": tokenTypes["Characters"],
1719 "data": data})
1720 self.state = self.dataState
1721 return True
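The states above only ever append token dicts to self.tokenQueue; iterating the tokenizer drains that queue. A minimal sketch of driving it (illustrative only, assuming the standalone html5lib distribution is installed; the vendored copy under pip._vendor exposes the same classes):

from html5lib._tokenizer import HTMLTokenizer
from html5lib.constants import tokenTypes

# Map the integer token-type codes back to readable names for display.
names = {value: name for name, value in tokenTypes.items()}

for token in HTMLTokenizer("<p class='x'>hi &amp; bye</p>"):
    # Each token is a plain dict; start tags carry their attributes as a
    # list of [name, value] pairs until the parser normalises them.
    print(names[token["type"]], token.get("name", token.get("data")))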
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/__init__.py
new file mode 100644
index 0000000..ccc70bd
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/__init__.py
@@ -0,0 +1,14 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from .py import Trie as PyTrie
4
5Trie = PyTrie
6
7# pylint:disable=wrong-import-position
8try:
9 from .datrie import Trie as DATrie
10except ImportError:
11 pass
12else:
13 Trie = DATrie
14# pylint:enable=wrong-import-position
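
This __init__ prefers the C-backed datrie implementation when it imports cleanly and otherwise falls back to the pure-Python Trie. A small usage sketch, assuming the vendored import path of this tree is importable in the current environment:

from pip._vendor.html5lib._trie import Trie  # DATrie if available, else PyTrie

t = Trie({"amp": "&", "amp;": "&", "angle;": "\u2220"})
print(t.has_keys_with_prefix("am"))  # True
print(t.longest_prefix("amp;x"))     # "amp;"
print(sorted(t.keys("amp")))         # ["amp", "amp;"]
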
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/_base.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/_base.py
new file mode 100644
index 0000000..ecfff32
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/_base.py
@@ -0,0 +1,37 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from collections import Mapping
4
5
6class Trie(Mapping):
7 """Abstract base class for tries"""
8
9 def keys(self, prefix=None):
10 # pylint:disable=arguments-differ
11 keys = super(Trie, self).keys()
12
13 if prefix is None:
14 return set(keys)
15
16 return {x for x in keys if x.startswith(prefix)}
17
18 def has_keys_with_prefix(self, prefix):
19 for key in self.keys():
20 if key.startswith(prefix):
21 return True
22
23 return False
24
25 def longest_prefix(self, prefix):
26 if prefix in self:
27 return prefix
28
29 for i in range(1, len(prefix) + 1):
30 if prefix[:-i] in self:
31 return prefix[:-i]
32
33 raise KeyError(prefix)
34
35 def longest_prefix_item(self, prefix):
36 lprefix = self.longest_prefix(prefix)
37 return (lprefix, self[lprefix])
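
Because this base class is a Mapping, any concrete subclass that supplies __getitem__, __iter__ and __len__ inherits the prefix helpers defined above. A hypothetical dict-backed illustration (the class name and data are made up for this sketch):

from pip._vendor.html5lib._trie._base import Trie as ABCTrie

class DictTrie(ABCTrie):
    # Hypothetical subclass backed by a plain dict, for illustration only.
    def __init__(self, data):
        self._data = dict(data)

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

t = DictTrie({"not": "\u00ac", "notin;": "\u2209"})
print(t.longest_prefix("notin;x"))   # "notin;"
print(t.keys("no"))                  # {"not", "notin;"}
print(t.has_keys_with_prefix("q"))   # False
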
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/datrie.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/datrie.py
new file mode 100644
index 0000000..cb1af60
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/datrie.py
@@ -0,0 +1,44 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from datrie import Trie as DATrie
4from pip._vendor.six import text_type
5
6from ._base import Trie as ABCTrie
7
8
9class Trie(ABCTrie):
10 def __init__(self, data):
11 chars = set()
12 for key in data.keys():
13 if not isinstance(key, text_type):
14 raise TypeError("All keys must be strings")
15 for char in key:
16 chars.add(char)
17
18 self._data = DATrie("".join(chars))
19 for key, value in data.items():
20 self._data[key] = value
21
22 def __contains__(self, key):
23 return key in self._data
24
25 def __len__(self):
26 return len(self._data)
27
28 def __iter__(self):
29 raise NotImplementedError()
30
31 def __getitem__(self, key):
32 return self._data[key]
33
34 def keys(self, prefix=None):
35 return self._data.keys(prefix)
36
37 def has_keys_with_prefix(self, prefix):
38 return self._data.has_keys_with_prefix(prefix)
39
40 def longest_prefix(self, prefix):
41 return self._data.longest_prefix(prefix)
42
43 def longest_prefix_item(self, prefix):
44 return self._data.longest_prefix_item(prefix)
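
The constructor above derives the datrie alphabet from every character appearing in the keys, since the C-backed trie needs its alphabet up front. A guarded sketch that only runs when the optional datrie package is installed (import path assumed from this tree):

try:
    from pip._vendor.html5lib._trie.datrie import Trie as DATrie
except ImportError:  # datrie is optional; skip the demo without it
    DATrie = None

if DATrie is not None:
    t = DATrie({"lt;": "<", "lambda;": "\u03bb"})
    print(t.longest_prefix("lt;la"))      # "lt;"
    print(t.has_keys_with_prefix("lam"))  # True
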
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/py.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/py.py
new file mode 100644
index 0000000..5531263
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_trie/py.py
@@ -0,0 +1,67 @@
1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import text_type
3
4from bisect import bisect_left
5
6from ._base import Trie as ABCTrie
7
8
9class Trie(ABCTrie):
10 def __init__(self, data):
11 if not all(isinstance(x, text_type) for x in data.keys()):
12 raise TypeError("All keys must be strings")
13
14 self._data = data
15 self._keys = sorted(data.keys())
16 self._cachestr = ""
17 self._cachepoints = (0, len(data))
18
19 def __contains__(self, key):
20 return key in self._data
21
22 def __len__(self):
23 return len(self._data)
24
25 def __iter__(self):
26 return iter(self._data)
27
28 def __getitem__(self, key):
29 return self._data[key]
30
31 def keys(self, prefix=None):
32 if prefix is None or prefix == "" or not self._keys:
33 return set(self._keys)
34
35 if prefix.startswith(self._cachestr):
36 lo, hi = self._cachepoints
37 start = i = bisect_left(self._keys, prefix, lo, hi)
38 else:
39 start = i = bisect_left(self._keys, prefix)
40
41 keys = set()
42 if start == len(self._keys):
43 return keys
44
45 while i < len(self._keys) and self._keys[i].startswith(prefix):
46 keys.add(self._keys[i])
47 i += 1
48
49 self._cachestr = prefix
50 self._cachepoints = (start, i)
51
52 return keys
53
54 def has_keys_with_prefix(self, prefix):
55 if prefix in self._data:
56 return True
57
58 if prefix.startswith(self._cachestr):
59 lo, hi = self._cachepoints
60 i = bisect_left(self._keys, prefix, lo, hi)
61 else:
62 i = bisect_left(self._keys, prefix)
63
64 if i == len(self._keys):
65 return False
66
67 return self._keys[i].startswith(prefix)
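
keys() and has_keys_with_prefix() above rely on the fact that, in a sorted key list, every key sharing a given prefix forms a contiguous run starting at bisect_left(keys, prefix); the (start, i) cache merely narrows later searches that extend the same prefix. A standalone sketch of that idea:

from bisect import bisect_left

keys = sorted(["amp", "amp;", "and;", "angle;", "apos;"])
prefix = "an"
i = bisect_left(keys, prefix)   # first position where a matching key can appear
run = []
while i < len(keys) and keys[i].startswith(prefix):
    run.append(keys[i])
    i += 1
print(run)  # ['and;', 'angle;']
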
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_utils.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_utils.py
new file mode 100644
index 0000000..a559fa0
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_utils.py
@@ -0,0 +1,124 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from types import ModuleType
4
5from pip._vendor.six import text_type
6
7try:
8 import xml.etree.cElementTree as default_etree
9except ImportError:
10 import xml.etree.ElementTree as default_etree
11
12
13__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
14 "surrogatePairToCodepoint", "moduleFactoryFactory",
15 "supports_lone_surrogates"]
16
17
18# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
19# caught by the below test. In general this would be any platform
20# using UTF-16 as its encoding of unicode strings, such as
21# Jython. This is because UTF-16 itself is based on the use of such
22# surrogates, and there is no mechanism to further escape such
23# escapes.
24try:
25 _x = eval('"\\uD800"') # pylint:disable=eval-used
26 if not isinstance(_x, text_type):
27 # We need this with u"" because of http://bugs.jython.org/issue2039
28 _x = eval('u"\\uD800"') # pylint:disable=eval-used
29 assert isinstance(_x, text_type)
30except: # pylint:disable=bare-except
31 supports_lone_surrogates = False
32else:
33 supports_lone_surrogates = True
34
35
36class MethodDispatcher(dict):
37 """Dict with 2 special properties:
38
39 On initiation, keys that are lists, sets or tuples are converted to
40 multiple keys so accessing any one of the items in the original
41 list-like object returns the matching value
42
43 md = MethodDispatcher({("foo", "bar"): "baz"}.items())
44 md["foo"] == "baz"
45
46 A default value which can be set through the default attribute.
47 """
48
49 def __init__(self, items=()):
50 # Using _dictEntries instead of directly assigning to self is about
51 # twice as fast. Please do careful performance testing before changing
52 # anything here.
53 _dictEntries = []
54 for name, value in items:
55 if isinstance(name, (list, tuple, frozenset, set)):
56 for item in name:
57 _dictEntries.append((item, value))
58 else:
59 _dictEntries.append((name, value))
60 dict.__init__(self, _dictEntries)
61 assert len(self) == len(_dictEntries)
62 self.default = None
63
64 def __getitem__(self, key):
65 return dict.get(self, key, self.default)
66
67
68# Some utility functions to deal with weirdness around UCS2 vs UCS4
69# python builds
70
71def isSurrogatePair(data):
72 return (len(data) == 2 and
73 ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
74 ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
75
76
77def surrogatePairToCodepoint(data):
78 char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
79 (ord(data[1]) - 0xDC00))
80 return char_val
81
82# Module Factory Factory (no, this isn't Java, I know)
83# Here to stop this being duplicated all over the place.
84
85
86def moduleFactoryFactory(factory):
87 moduleCache = {}
88
89 def moduleFactory(baseModule, *args, **kwargs):
90 if isinstance(ModuleType.__name__, type("")):
91 name = "_%s_factory" % baseModule.__name__
92 else:
93 name = b"_%s_factory" % baseModule.__name__
94
95 kwargs_tuple = tuple(kwargs.items())
96
97 try:
98 return moduleCache[name][args][kwargs_tuple]
99 except KeyError:
100 mod = ModuleType(name)
101 objs = factory(baseModule, *args, **kwargs)
102 mod.__dict__.update(objs)
103 if "name" not in moduleCache:
104 moduleCache[name] = {}
105 if "args" not in moduleCache[name]:
106 moduleCache[name][args] = {}
107 if "kwargs" not in moduleCache[name][args]:
108 moduleCache[name][args][kwargs_tuple] = {}
109 moduleCache[name][args][kwargs_tuple] = mod
110 return mod
111
112 return moduleFactory
113
114
115def memoize(func):
116 cache = {}
117
118 def wrapped(*args, **kwargs):
119 key = (tuple(args), tuple(kwargs.items()))
120 if key not in cache:
121 cache[key] = func(*args, **kwargs)
122 return cache[key]
123
124 return wrapped
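
MethodDispatcher above expands list, tuple and set keys into one entry per member and falls back to its default attribute for unknown keys, while the surrogate helpers implement the usual UTF-16 pair arithmetic. A short sketch, assuming the vendored import path of this tree:

from pip._vendor.html5lib._utils import (MethodDispatcher, isSurrogatePair,
                                         surrogatePairToCodepoint)

md = MethodDispatcher([(("b", "strong"), "formatting"), ("p", "block")])
md.default = "other"
print(md["strong"])  # "formatting"
print(md["div"])     # "other" (no entry, so the default attribute is returned)

pair = "\uD83D\uDE00"                       # high + low surrogate for U+1F600
print(isSurrogatePair(pair))                # True
print(hex(surrogatePairToCodepoint(pair)))  # 0x1f600
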
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/constants.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/constants.py
new file mode 100644
index 0000000..bca155e
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/constants.py
@@ -0,0 +1,2947 @@
1from __future__ import absolute_import, division, unicode_literals
2
3import string
4
5EOF = None
6
7E = {
8 "null-character":
9 "Null character in input stream, replaced with U+FFFD.",
10 "invalid-codepoint":
11 "Invalid codepoint in stream.",
12 "incorrectly-placed-solidus":
13 "Solidus (/) incorrectly placed in tag.",
14 "incorrect-cr-newline-entity":
15 "Incorrect CR newline entity, replaced with LF.",
16 "illegal-windows-1252-entity":
17 "Entity used with illegal number (windows-1252 reference).",
18 "cant-convert-numeric-entity":
19 "Numeric entity couldn't be converted to character "
20 "(codepoint U+%(charAsInt)08x).",
21 "illegal-codepoint-for-numeric-entity":
22 "Numeric entity represents an illegal codepoint: "
23 "U+%(charAsInt)08x.",
24 "numeric-entity-without-semicolon":
25 "Numeric entity didn't end with ';'.",
26 "expected-numeric-entity-but-got-eof":
27 "Numeric entity expected. Got end of file instead.",
28 "expected-numeric-entity":
29 "Numeric entity expected but none found.",
30 "named-entity-without-semicolon":
31 "Named entity didn't end with ';'.",
32 "expected-named-entity":
33 "Named entity expected. Got none.",
34 "attributes-in-end-tag":
35 "End tag contains unexpected attributes.",
36 'self-closing-flag-on-end-tag':
37 "End tag contains unexpected self-closing flag.",
38 "expected-tag-name-but-got-right-bracket":
39 "Expected tag name. Got '>' instead.",
40 "expected-tag-name-but-got-question-mark":
41 "Expected tag name. Got '?' instead. (HTML doesn't "
42 "support processing instructions.)",
43 "expected-tag-name":
44 "Expected tag name. Got something else instead",
45 "expected-closing-tag-but-got-right-bracket":
46 "Expected closing tag. Got '>' instead. Ignoring '</>'.",
47 "expected-closing-tag-but-got-eof":
48 "Expected closing tag. Unexpected end of file.",
49 "expected-closing-tag-but-got-char":
50 "Expected closing tag. Unexpected character '%(data)s' found.",
51 "eof-in-tag-name":
52 "Unexpected end of file in the tag name.",
53 "expected-attribute-name-but-got-eof":
54 "Unexpected end of file. Expected attribute name instead.",
55 "eof-in-attribute-name":
56 "Unexpected end of file in attribute name.",
57 "invalid-character-in-attribute-name":
58 "Invalid character in attribute name",
59 "duplicate-attribute":
60 "Dropped duplicate attribute on tag.",
61 "expected-end-of-tag-name-but-got-eof":
62 "Unexpected end of file. Expected = or end of tag.",
63 "expected-attribute-value-but-got-eof":
64 "Unexpected end of file. Expected attribute value.",
65 "expected-attribute-value-but-got-right-bracket":
66 "Expected attribute value. Got '>' instead.",
67 'equals-in-unquoted-attribute-value':
68 "Unexpected = in unquoted attribute",
69 'unexpected-character-in-unquoted-attribute-value':
70 "Unexpected character in unquoted attribute",
71 "invalid-character-after-attribute-name":
72 "Unexpected character after attribute name.",
73 "unexpected-character-after-attribute-value":
74 "Unexpected character after attribute value.",
75 "eof-in-attribute-value-double-quote":
76 "Unexpected end of file in attribute value (\").",
77 "eof-in-attribute-value-single-quote":
78 "Unexpected end of file in attribute value (').",
79 "eof-in-attribute-value-no-quotes":
80 "Unexpected end of file in attribute value.",
81 "unexpected-EOF-after-solidus-in-tag":
82 "Unexpected end of file in tag. Expected >",
83 "unexpected-character-after-solidus-in-tag":
84 "Unexpected character after / in tag. Expected >",
85 "expected-dashes-or-doctype":
86 "Expected '--' or 'DOCTYPE'. Not found.",
87 "unexpected-bang-after-double-dash-in-comment":
88 "Unexpected ! after -- in comment",
89 "unexpected-space-after-double-dash-in-comment":
90 "Unexpected space after -- in comment",
91 "incorrect-comment":
92 "Incorrect comment.",
93 "eof-in-comment":
94 "Unexpected end of file in comment.",
95 "eof-in-comment-end-dash":
96 "Unexpected end of file in comment (-)",
97 "unexpected-dash-after-double-dash-in-comment":
98 "Unexpected '-' after '--' found in comment.",
99 "eof-in-comment-double-dash":
100 "Unexpected end of file in comment (--).",
101 "eof-in-comment-end-space-state":
102 "Unexpected end of file in comment.",
103 "eof-in-comment-end-bang-state":
104 "Unexpected end of file in comment.",
105 "unexpected-char-in-comment":
106 "Unexpected character in comment found.",
107 "need-space-after-doctype":
108 "No space after literal string 'DOCTYPE'.",
109 "expected-doctype-name-but-got-right-bracket":
110 "Unexpected > character. Expected DOCTYPE name.",
111 "expected-doctype-name-but-got-eof":
112 "Unexpected end of file. Expected DOCTYPE name.",
113 "eof-in-doctype-name":
114 "Unexpected end of file in DOCTYPE name.",
115 "eof-in-doctype":
116 "Unexpected end of file in DOCTYPE.",
117 "expected-space-or-right-bracket-in-doctype":
118 "Expected space or '>'. Got '%(data)s'",
119 "unexpected-end-of-doctype":
120 "Unexpected end of DOCTYPE.",
121 "unexpected-char-in-doctype":
122 "Unexpected character in DOCTYPE.",
123 "eof-in-innerhtml":
124 "XXX innerHTML EOF",
125 "unexpected-doctype":
126 "Unexpected DOCTYPE. Ignored.",
127 "non-html-root":
128 "html needs to be the first start tag.",
129 "expected-doctype-but-got-eof":
130 "Unexpected End of file. Expected DOCTYPE.",
131 "unknown-doctype":
132 "Erroneous DOCTYPE.",
133 "expected-doctype-but-got-chars":
134 "Unexpected non-space characters. Expected DOCTYPE.",
135 "expected-doctype-but-got-start-tag":
136 "Unexpected start tag (%(name)s). Expected DOCTYPE.",
137 "expected-doctype-but-got-end-tag":
138 "Unexpected end tag (%(name)s). Expected DOCTYPE.",
139 "end-tag-after-implied-root":
140 "Unexpected end tag (%(name)s) after the (implied) root element.",
141 "expected-named-closing-tag-but-got-eof":
142 "Unexpected end of file. Expected end tag (%(name)s).",
143 "two-heads-are-not-better-than-one":
144 "Unexpected start tag head in existing head. Ignored.",
145 "unexpected-end-tag":
146 "Unexpected end tag (%(name)s). Ignored.",
147 "unexpected-start-tag-out-of-my-head":
148 "Unexpected start tag (%(name)s) that can be in head. Moved.",
149 "unexpected-start-tag":
150 "Unexpected start tag (%(name)s).",
151 "missing-end-tag":
152 "Missing end tag (%(name)s).",
153 "missing-end-tags":
154 "Missing end tags (%(name)s).",
155 "unexpected-start-tag-implies-end-tag":
156 "Unexpected start tag (%(startName)s) "
157 "implies end tag (%(endName)s).",
158 "unexpected-start-tag-treated-as":
159 "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
160 "deprecated-tag":
161 "Unexpected start tag %(name)s. Don't use it!",
162 "unexpected-start-tag-ignored":
163 "Unexpected start tag %(name)s. Ignored.",
164 "expected-one-end-tag-but-got-another":
165 "Unexpected end tag (%(gotName)s). "
166 "Missing end tag (%(expectedName)s).",
167 "end-tag-too-early":
168 "End tag (%(name)s) seen too early. Expected other end tag.",
169 "end-tag-too-early-named":
170 "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
171 "end-tag-too-early-ignored":
172 "End tag (%(name)s) seen too early. Ignored.",
173 "adoption-agency-1.1":
174 "End tag (%(name)s) violates step 1, "
175 "paragraph 1 of the adoption agency algorithm.",
176 "adoption-agency-1.2":
177 "End tag (%(name)s) violates step 1, "
178 "paragraph 2 of the adoption agency algorithm.",
179 "adoption-agency-1.3":
180 "End tag (%(name)s) violates step 1, "
181 "paragraph 3 of the adoption agency algorithm.",
182 "adoption-agency-4.4":
183 "End tag (%(name)s) violates step 4, "
184 "paragraph 4 of the adoption agency algorithm.",
185 "unexpected-end-tag-treated-as":
186 "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
187 "no-end-tag":
188 "This element (%(name)s) has no end tag.",
189 "unexpected-implied-end-tag-in-table":
190 "Unexpected implied end tag (%(name)s) in the table phase.",
191 "unexpected-implied-end-tag-in-table-body":
192 "Unexpected implied end tag (%(name)s) in the table body phase.",
193 "unexpected-char-implies-table-voodoo":
194 "Unexpected non-space characters in "
195 "table context caused voodoo mode.",
196 "unexpected-hidden-input-in-table":
197 "Unexpected input with type hidden in table context.",
198 "unexpected-form-in-table":
199 "Unexpected form in table context.",
200 "unexpected-start-tag-implies-table-voodoo":
201 "Unexpected start tag (%(name)s) in "
202 "table context caused voodoo mode.",
203 "unexpected-end-tag-implies-table-voodoo":
204 "Unexpected end tag (%(name)s) in "
205 "table context caused voodoo mode.",
206 "unexpected-cell-in-table-body":
207 "Unexpected table cell start tag (%(name)s) "
208 "in the table body phase.",
209 "unexpected-cell-end-tag":
210 "Got table cell end tag (%(name)s) "
211 "while required end tags are missing.",
212 "unexpected-end-tag-in-table-body":
213 "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
214 "unexpected-implied-end-tag-in-table-row":
215 "Unexpected implied end tag (%(name)s) in the table row phase.",
216 "unexpected-end-tag-in-table-row":
217 "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
218 "unexpected-select-in-select":
219 "Unexpected select start tag in the select phase "
220 "treated as select end tag.",
221 "unexpected-input-in-select":
222 "Unexpected input start tag in the select phase.",
223 "unexpected-start-tag-in-select":
224 "Unexpected start tag token (%(name)s in the select phase. "
225 "Ignored.",
226 "unexpected-end-tag-in-select":
227 "Unexpected end tag (%(name)s) in the select phase. Ignored.",
228 "unexpected-table-element-start-tag-in-select-in-table":
229 "Unexpected table element start tag (%(name)s) in the select in table phase.",
230 "unexpected-table-element-end-tag-in-select-in-table":
231 "Unexpected table element end tag (%(name)s) in the select in table phase.",
232 "unexpected-char-after-body":
233 "Unexpected non-space characters in the after body phase.",
234 "unexpected-start-tag-after-body":
235 "Unexpected start tag token (%(name)s)"
236 " in the after body phase.",
237 "unexpected-end-tag-after-body":
238 "Unexpected end tag token (%(name)s)"
239 " in the after body phase.",
240 "unexpected-char-in-frameset":
241 "Unexpected characters in the frameset phase. Characters ignored.",
242 "unexpected-start-tag-in-frameset":
243 "Unexpected start tag token (%(name)s)"
244 " in the frameset phase. Ignored.",
245 "unexpected-frameset-in-frameset-innerhtml":
246 "Unexpected end tag token (frameset) "
247 "in the frameset phase (innerHTML).",
248 "unexpected-end-tag-in-frameset":
249 "Unexpected end tag token (%(name)s)"
250 " in the frameset phase. Ignored.",
251 "unexpected-char-after-frameset":
252 "Unexpected non-space characters in the "
253 "after frameset phase. Ignored.",
254 "unexpected-start-tag-after-frameset":
255 "Unexpected start tag (%(name)s)"
256 " in the after frameset phase. Ignored.",
257 "unexpected-end-tag-after-frameset":
258 "Unexpected end tag (%(name)s)"
259 " in the after frameset phase. Ignored.",
260 "unexpected-end-tag-after-body-innerhtml":
261 "Unexpected end tag after body(innerHtml)",
262 "expected-eof-but-got-char":
263 "Unexpected non-space characters. Expected end of file.",
264 "expected-eof-but-got-start-tag":
265 "Unexpected start tag (%(name)s)"
266 ". Expected end of file.",
267 "expected-eof-but-got-end-tag":
268 "Unexpected end tag (%(name)s)"
269 ". Expected end of file.",
270 "eof-in-table":
271 "Unexpected end of file. Expected table content.",
272 "eof-in-select":
273 "Unexpected end of file. Expected select content.",
274 "eof-in-frameset":
275 "Unexpected end of file. Expected frameset content.",
276 "eof-in-script-in-script":
277 "Unexpected end of file. Expected script content.",
278 "eof-in-foreign-lands":
279 "Unexpected end of file. Expected foreign content",
280 "non-void-element-with-trailing-solidus":
281 "Trailing solidus not allowed on element %(name)s",
282 "unexpected-html-element-in-foreign-content":
283 "Element %(name)s not allowed in a non-html context",
284 "unexpected-end-tag-before-html":
285 "Unexpected end tag (%(name)s) before html.",
286 "unexpected-inhead-noscript-tag":
287 "Element %(name)s not allowed in a inhead-noscript context",
288 "eof-in-head-noscript":
289 "Unexpected end of file. Expected inhead-noscript content",
290 "char-in-head-noscript":
291 "Unexpected non-space character. Expected inhead-noscript content",
292 "XXX-undefined-error":
293 "Undefined error (this sucks and should be fixed)",
294}
295
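# Editorial sketch, not part of the vendored file: each message above is a
# %-style template keyed by error code, and callers substitute a dict of
# details (hedged: the parser's parseError formats them this way from its
# datavars argument when running in strict mode).
_example = E["expected-doctype-but-got-start-tag"] % {"name": "div"}
assert _example == "Unexpected start tag (div). Expected DOCTYPE."
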
296namespaces = {
297 "html": "http://www.w3.org/1999/xhtml",
298 "mathml": "http://www.w3.org/1998/Math/MathML",
299 "svg": "http://www.w3.org/2000/svg",
300 "xlink": "http://www.w3.org/1999/xlink",
301 "xml": "http://www.w3.org/XML/1998/namespace",
302 "xmlns": "http://www.w3.org/2000/xmlns/"
303}
304
305scopingElements = frozenset([
306 (namespaces["html"], "applet"),
307 (namespaces["html"], "caption"),
308 (namespaces["html"], "html"),
309 (namespaces["html"], "marquee"),
310 (namespaces["html"], "object"),
311 (namespaces["html"], "table"),
312 (namespaces["html"], "td"),
313 (namespaces["html"], "th"),
314 (namespaces["mathml"], "mi"),
315 (namespaces["mathml"], "mo"),
316 (namespaces["mathml"], "mn"),
317 (namespaces["mathml"], "ms"),
318 (namespaces["mathml"], "mtext"),
319 (namespaces["mathml"], "annotation-xml"),
320 (namespaces["svg"], "foreignObject"),
321 (namespaces["svg"], "desc"),
322 (namespaces["svg"], "title"),
323])
324
325formattingElements = frozenset([
326 (namespaces["html"], "a"),
327 (namespaces["html"], "b"),
328 (namespaces["html"], "big"),
329 (namespaces["html"], "code"),
330 (namespaces["html"], "em"),
331 (namespaces["html"], "font"),
332 (namespaces["html"], "i"),
333 (namespaces["html"], "nobr"),
334 (namespaces["html"], "s"),
335 (namespaces["html"], "small"),
336 (namespaces["html"], "strike"),
337 (namespaces["html"], "strong"),
338 (namespaces["html"], "tt"),
339 (namespaces["html"], "u")
340])
341
342specialElements = frozenset([
343 (namespaces["html"], "address"),
344 (namespaces["html"], "applet"),
345 (namespaces["html"], "area"),
346 (namespaces["html"], "article"),
347 (namespaces["html"], "aside"),
348 (namespaces["html"], "base"),
349 (namespaces["html"], "basefont"),
350 (namespaces["html"], "bgsound"),
351 (namespaces["html"], "blockquote"),
352 (namespaces["html"], "body"),
353 (namespaces["html"], "br"),
354 (namespaces["html"], "button"),
355 (namespaces["html"], "caption"),
356 (namespaces["html"], "center"),
357 (namespaces["html"], "col"),
358 (namespaces["html"], "colgroup"),
359 (namespaces["html"], "command"),
360 (namespaces["html"], "dd"),
361 (namespaces["html"], "details"),
362 (namespaces["html"], "dir"),
363 (namespaces["html"], "div"),
364 (namespaces["html"], "dl"),
365 (namespaces["html"], "dt"),
366 (namespaces["html"], "embed"),
367 (namespaces["html"], "fieldset"),
368 (namespaces["html"], "figure"),
369 (namespaces["html"], "footer"),
370 (namespaces["html"], "form"),
371 (namespaces["html"], "frame"),
372 (namespaces["html"], "frameset"),
373 (namespaces["html"], "h1"),
374 (namespaces["html"], "h2"),
375 (namespaces["html"], "h3"),
376 (namespaces["html"], "h4"),
377 (namespaces["html"], "h5"),
378 (namespaces["html"], "h6"),
379 (namespaces["html"], "head"),
380 (namespaces["html"], "header"),
381 (namespaces["html"], "hr"),
382 (namespaces["html"], "html"),
383 (namespaces["html"], "iframe"),
384 # Note that image is commented out in the spec as "this isn't an
385 # element that can end up on the stack, so it doesn't matter,"
386 (namespaces["html"], "image"),
387 (namespaces["html"], "img"),
388 (namespaces["html"], "input"),
389 (namespaces["html"], "isindex"),
390 (namespaces["html"], "li"),
391 (namespaces["html"], "link"),
392 (namespaces["html"], "listing"),
393 (namespaces["html"], "marquee"),
394 (namespaces["html"], "menu"),
395 (namespaces["html"], "meta"),
396 (namespaces["html"], "nav"),
397 (namespaces["html"], "noembed"),
398 (namespaces["html"], "noframes"),
399 (namespaces["html"], "noscript"),
400 (namespaces["html"], "object"),
401 (namespaces["html"], "ol"),
402 (namespaces["html"], "p"),
403 (namespaces["html"], "param"),
404 (namespaces["html"], "plaintext"),
405 (namespaces["html"], "pre"),
406 (namespaces["html"], "script"),
407 (namespaces["html"], "section"),
408 (namespaces["html"], "select"),
409 (namespaces["html"], "style"),
410 (namespaces["html"], "table"),
411 (namespaces["html"], "tbody"),
412 (namespaces["html"], "td"),
413 (namespaces["html"], "textarea"),
414 (namespaces["html"], "tfoot"),
415 (namespaces["html"], "th"),
416 (namespaces["html"], "thead"),
417 (namespaces["html"], "title"),
418 (namespaces["html"], "tr"),
419 (namespaces["html"], "ul"),
420 (namespaces["html"], "wbr"),
421 (namespaces["html"], "xmp"),
422 (namespaces["svg"], "foreignObject")
423])
424
425htmlIntegrationPointElements = frozenset([
426 (namespaces["mathml"], "annotation-xml"),
427 (namespaces["svg"], "foreignObject"),
428 (namespaces["svg"], "desc"),
429 (namespaces["svg"], "title")
430])
431
432mathmlTextIntegrationPointElements = frozenset([
433 (namespaces["mathml"], "mi"),
434 (namespaces["mathml"], "mo"),
435 (namespaces["mathml"], "mn"),
436 (namespaces["mathml"], "ms"),
437 (namespaces["mathml"], "mtext")
438])
439
440adjustSVGAttributes = {
441 "attributename": "attributeName",
442 "attributetype": "attributeType",
443 "basefrequency": "baseFrequency",
444 "baseprofile": "baseProfile",
445 "calcmode": "calcMode",
446 "clippathunits": "clipPathUnits",
447 "contentscripttype": "contentScriptType",
448 "contentstyletype": "contentStyleType",
449 "diffuseconstant": "diffuseConstant",
450 "edgemode": "edgeMode",
451 "externalresourcesrequired": "externalResourcesRequired",
452 "filterres": "filterRes",
453 "filterunits": "filterUnits",
454 "glyphref": "glyphRef",
455 "gradienttransform": "gradientTransform",
456 "gradientunits": "gradientUnits",
457 "kernelmatrix": "kernelMatrix",
458 "kernelunitlength": "kernelUnitLength",
459 "keypoints": "keyPoints",
460 "keysplines": "keySplines",
461 "keytimes": "keyTimes",
462 "lengthadjust": "lengthAdjust",
463 "limitingconeangle": "limitingConeAngle",
464 "markerheight": "markerHeight",
465 "markerunits": "markerUnits",
466 "markerwidth": "markerWidth",
467 "maskcontentunits": "maskContentUnits",
468 "maskunits": "maskUnits",
469 "numoctaves": "numOctaves",
470 "pathlength": "pathLength",
471 "patterncontentunits": "patternContentUnits",
472 "patterntransform": "patternTransform",
473 "patternunits": "patternUnits",
474 "pointsatx": "pointsAtX",
475 "pointsaty": "pointsAtY",
476 "pointsatz": "pointsAtZ",
477 "preservealpha": "preserveAlpha",
478 "preserveaspectratio": "preserveAspectRatio",
479 "primitiveunits": "primitiveUnits",
480 "refx": "refX",
481 "refy": "refY",
482 "repeatcount": "repeatCount",
483 "repeatdur": "repeatDur",
484 "requiredextensions": "requiredExtensions",
485 "requiredfeatures": "requiredFeatures",
486 "specularconstant": "specularConstant",
487 "specularexponent": "specularExponent",
488 "spreadmethod": "spreadMethod",
489 "startoffset": "startOffset",
490 "stddeviation": "stdDeviation",
491 "stitchtiles": "stitchTiles",
492 "surfacescale": "surfaceScale",
493 "systemlanguage": "systemLanguage",
494 "tablevalues": "tableValues",
495 "targetx": "targetX",
496 "targety": "targetY",
497 "textlength": "textLength",
498 "viewbox": "viewBox",
499 "viewtarget": "viewTarget",
500 "xchannelselector": "xChannelSelector",
501 "ychannelselector": "yChannelSelector",
502 "zoomandpan": "zoomAndPan"
503}
504
505adjustMathMLAttributes = {"definitionurl": "definitionURL"}
506
507adjustForeignAttributes = {
508 "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
509 "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
510 "xlink:href": ("xlink", "href", namespaces["xlink"]),
511 "xlink:role": ("xlink", "role", namespaces["xlink"]),
512 "xlink:show": ("xlink", "show", namespaces["xlink"]),
513 "xlink:title": ("xlink", "title", namespaces["xlink"]),
514 "xlink:type": ("xlink", "type", namespaces["xlink"]),
515 "xml:base": ("xml", "base", namespaces["xml"]),
516 "xml:lang": ("xml", "lang", namespaces["xml"]),
517 "xml:space": ("xml", "space", namespaces["xml"]),
518 "xmlns": (None, "xmlns", namespaces["xmlns"]),
519 "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
520}
521
522unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
523 adjustForeignAttributes.items()])
524
525spaceCharacters = frozenset([
526 "\t",
527 "\n",
528 "\u000C",
529 " ",
530 "\r"
531])
532
533tableInsertModeElements = frozenset([
534 "table",
535 "tbody",
536 "tfoot",
537 "thead",
538 "tr"
539])
540
541asciiLowercase = frozenset(string.ascii_lowercase)
542asciiUppercase = frozenset(string.ascii_uppercase)
543asciiLetters = frozenset(string.ascii_letters)
544digits = frozenset(string.digits)
545hexDigits = frozenset(string.hexdigits)
546
547asciiUpper2Lower = dict([(ord(c), ord(c.lower()))
548 for c in string.ascii_uppercase])
549
550# Heading elements need to be ordered
551headingElements = (
552 "h1",
553 "h2",
554 "h3",
555 "h4",
556 "h5",
557 "h6"
558)
559
560voidElements = frozenset([
561 "base",
562 "command",
563 "event-source",
564 "link",
565 "meta",
566 "hr",
567 "br",
568 "img",
569 "embed",
570 "param",
571 "area",
572 "col",
573 "input",
574 "source",
575 "track"
576])
577
578cdataElements = frozenset(['title', 'textarea'])
579
580rcdataElements = frozenset([
581 'style',
582 'script',
583 'xmp',
584 'iframe',
585 'noembed',
586 'noframes',
587 'noscript'
588])
589
590booleanAttributes = {
591 "": frozenset(["irrelevant", "itemscope"]),
592 "style": frozenset(["scoped"]),
593 "img": frozenset(["ismap"]),
594 "audio": frozenset(["autoplay", "controls"]),
595 "video": frozenset(["autoplay", "controls"]),
596 "script": frozenset(["defer", "async"]),
597 "details": frozenset(["open"]),
598 "datagrid": frozenset(["multiple", "disabled"]),
599 "command": frozenset(["hidden", "disabled", "checked", "default"]),
600 "hr": frozenset(["noshade"]),
601 "menu": frozenset(["autosubmit"]),
602 "fieldset": frozenset(["disabled", "readonly"]),
603 "option": frozenset(["disabled", "readonly", "selected"]),
604 "optgroup": frozenset(["disabled", "readonly"]),
605 "button": frozenset(["disabled", "autofocus"]),
606 "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
607 "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
608 "output": frozenset(["disabled", "readonly"]),
609 "iframe": frozenset(["seamless"]),
610}
611
612# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
613# therefore can't be a frozenset.
614entitiesWindows1252 = (
615 8364, # 0x80 0x20AC EURO SIGN
616 65533, # 0x81 UNDEFINED
617 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
618 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
619 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
620 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
621 8224, # 0x86 0x2020 DAGGER
622 8225, # 0x87 0x2021 DOUBLE DAGGER
623 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
624 8240, # 0x89 0x2030 PER MILLE SIGN
625 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
626 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
627 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
628 65533, # 0x8D UNDEFINED
629 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
630 65533, # 0x8F UNDEFINED
631 65533, # 0x90 UNDEFINED
632 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
633 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
634 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
635 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
636 8226, # 0x95 0x2022 BULLET
637 8211, # 0x96 0x2013 EN DASH
638 8212, # 0x97 0x2014 EM DASH
639 732, # 0x98 0x02DC SMALL TILDE
640 8482, # 0x99 0x2122 TRADE MARK SIGN
641 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
642 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
643 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
644 65533, # 0x9D UNDEFINED
645 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
646 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
647)
648
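# Editorial sketch, not part of the vendored file: the tuple above is meant to
# be indexed by (code point - 0x80), remapping the 0x80-0x9F range to its
# windows-1252 characters, e.g. 0x93 -> U+201C LEFT DOUBLE QUOTATION MARK.
assert entitiesWindows1252[0x93 - 0x80] == 0x201C
assert chr(entitiesWindows1252[0x93 - 0x80]) == "\u201C"
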
649xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
650
651entities = {
652 "AElig": "\xc6",
653 "AElig;": "\xc6",
654 "AMP": "&",
655 "AMP;": "&",
656 "Aacute": "\xc1",
657 "Aacute;": "\xc1",
658 "Abreve;": "\u0102",
659 "Acirc": "\xc2",
660 "Acirc;": "\xc2",
661 "Acy;": "\u0410",
662 "Afr;": "\U0001d504",
663 "Agrave": "\xc0",
664 "Agrave;": "\xc0",
665 "Alpha;": "\u0391",
666 "Amacr;": "\u0100",
667 "And;": "\u2a53",
668 "Aogon;": "\u0104",
669 "Aopf;": "\U0001d538",
670 "ApplyFunction;": "\u2061",
671 "Aring": "\xc5",
672 "Aring;": "\xc5",
673 "Ascr;": "\U0001d49c",
674 "Assign;": "\u2254",
675 "Atilde": "\xc3",
676 "Atilde;": "\xc3",
677 "Auml": "\xc4",
678 "Auml;": "\xc4",
679 "Backslash;": "\u2216",
680 "Barv;": "\u2ae7",
681 "Barwed;": "\u2306",
682 "Bcy;": "\u0411",
683 "Because;": "\u2235",
684 "Bernoullis;": "\u212c",
685 "Beta;": "\u0392",
686 "Bfr;": "\U0001d505",
687 "Bopf;": "\U0001d539",
688 "Breve;": "\u02d8",
689 "Bscr;": "\u212c",
690 "Bumpeq;": "\u224e",
691 "CHcy;": "\u0427",
692 "COPY": "\xa9",
693 "COPY;": "\xa9",
694 "Cacute;": "\u0106",
695 "Cap;": "\u22d2",
696 "CapitalDifferentialD;": "\u2145",
697 "Cayleys;": "\u212d",
698 "Ccaron;": "\u010c",
699 "Ccedil": "\xc7",
700 "Ccedil;": "\xc7",
701 "Ccirc;": "\u0108",
702 "Cconint;": "\u2230",
703 "Cdot;": "\u010a",
704 "Cedilla;": "\xb8",
705 "CenterDot;": "\xb7",
706 "Cfr;": "\u212d",
707 "Chi;": "\u03a7",
708 "CircleDot;": "\u2299",
709 "CircleMinus;": "\u2296",
710 "CirclePlus;": "\u2295",
711 "CircleTimes;": "\u2297",
712 "ClockwiseContourIntegral;": "\u2232",
713 "CloseCurlyDoubleQuote;": "\u201d",
714 "CloseCurlyQuote;": "\u2019",
715 "Colon;": "\u2237",
716 "Colone;": "\u2a74",
717 "Congruent;": "\u2261",
718 "Conint;": "\u222f",
719 "ContourIntegral;": "\u222e",
720 "Copf;": "\u2102",
721 "Coproduct;": "\u2210",
722 "CounterClockwiseContourIntegral;": "\u2233",
723 "Cross;": "\u2a2f",
724 "Cscr;": "\U0001d49e",
725 "Cup;": "\u22d3",
726 "CupCap;": "\u224d",
727 "DD;": "\u2145",
728 "DDotrahd;": "\u2911",
729 "DJcy;": "\u0402",
730 "DScy;": "\u0405",
731 "DZcy;": "\u040f",
732 "Dagger;": "\u2021",
733 "Darr;": "\u21a1",
734 "Dashv;": "\u2ae4",
735 "Dcaron;": "\u010e",
736 "Dcy;": "\u0414",
737 "Del;": "\u2207",
738 "Delta;": "\u0394",
739 "Dfr;": "\U0001d507",
740 "DiacriticalAcute;": "\xb4",
741 "DiacriticalDot;": "\u02d9",
742 "DiacriticalDoubleAcute;": "\u02dd",
743 "DiacriticalGrave;": "`",
744 "DiacriticalTilde;": "\u02dc",
745 "Diamond;": "\u22c4",
746 "DifferentialD;": "\u2146",
747 "Dopf;": "\U0001d53b",
748 "Dot;": "\xa8",
749 "DotDot;": "\u20dc",
750 "DotEqual;": "\u2250",
751 "DoubleContourIntegral;": "\u222f",
752 "DoubleDot;": "\xa8",
753 "DoubleDownArrow;": "\u21d3",
754 "DoubleLeftArrow;": "\u21d0",
755 "DoubleLeftRightArrow;": "\u21d4",
756 "DoubleLeftTee;": "\u2ae4",
757 "DoubleLongLeftArrow;": "\u27f8",
758 "DoubleLongLeftRightArrow;": "\u27fa",
759 "DoubleLongRightArrow;": "\u27f9",
760 "DoubleRightArrow;": "\u21d2",
761 "DoubleRightTee;": "\u22a8",
762 "DoubleUpArrow;": "\u21d1",
763 "DoubleUpDownArrow;": "\u21d5",
764 "DoubleVerticalBar;": "\u2225",
765 "DownArrow;": "\u2193",
766 "DownArrowBar;": "\u2913",
767 "DownArrowUpArrow;": "\u21f5",
768 "DownBreve;": "\u0311",
769 "DownLeftRightVector;": "\u2950",
770 "DownLeftTeeVector;": "\u295e",
771 "DownLeftVector;": "\u21bd",
772 "DownLeftVectorBar;": "\u2956",
773 "DownRightTeeVector;": "\u295f",
774 "DownRightVector;": "\u21c1",
775 "DownRightVectorBar;": "\u2957",
776 "DownTee;": "\u22a4",
777 "DownTeeArrow;": "\u21a7",
778 "Downarrow;": "\u21d3",
779 "Dscr;": "\U0001d49f",
780 "Dstrok;": "\u0110",
781 "ENG;": "\u014a",
782 "ETH": "\xd0",
783 "ETH;": "\xd0",
784 "Eacute": "\xc9",
785 "Eacute;": "\xc9",
786 "Ecaron;": "\u011a",
787 "Ecirc": "\xca",
788 "Ecirc;": "\xca",
789 "Ecy;": "\u042d",
790 "Edot;": "\u0116",
791 "Efr;": "\U0001d508",
792 "Egrave": "\xc8",
793 "Egrave;": "\xc8",
794 "Element;": "\u2208",
795 "Emacr;": "\u0112",
796 "EmptySmallSquare;": "\u25fb",
797 "EmptyVerySmallSquare;": "\u25ab",
798 "Eogon;": "\u0118",
799 "Eopf;": "\U0001d53c",
800 "Epsilon;": "\u0395",
801 "Equal;": "\u2a75",
802 "EqualTilde;": "\u2242",
803 "Equilibrium;": "\u21cc",
804 "Escr;": "\u2130",
805 "Esim;": "\u2a73",
806 "Eta;": "\u0397",
807 "Euml": "\xcb",
808 "Euml;": "\xcb",
809 "Exists;": "\u2203",
810 "ExponentialE;": "\u2147",
811 "Fcy;": "\u0424",
812 "Ffr;": "\U0001d509",
813 "FilledSmallSquare;": "\u25fc",
814 "FilledVerySmallSquare;": "\u25aa",
815 "Fopf;": "\U0001d53d",
816 "ForAll;": "\u2200",
817 "Fouriertrf;": "\u2131",
818 "Fscr;": "\u2131",
819 "GJcy;": "\u0403",
820 "GT": ">",
821 "GT;": ">",
822 "Gamma;": "\u0393",
823 "Gammad;": "\u03dc",
824 "Gbreve;": "\u011e",
825 "Gcedil;": "\u0122",
826 "Gcirc;": "\u011c",
827 "Gcy;": "\u0413",
828 "Gdot;": "\u0120",
829 "Gfr;": "\U0001d50a",
830 "Gg;": "\u22d9",
831 "Gopf;": "\U0001d53e",
832 "GreaterEqual;": "\u2265",
833 "GreaterEqualLess;": "\u22db",
834 "GreaterFullEqual;": "\u2267",
835 "GreaterGreater;": "\u2aa2",
836 "GreaterLess;": "\u2277",
837 "GreaterSlantEqual;": "\u2a7e",
838 "GreaterTilde;": "\u2273",
839 "Gscr;": "\U0001d4a2",
840 "Gt;": "\u226b",
841 "HARDcy;": "\u042a",
842 "Hacek;": "\u02c7",
843 "Hat;": "^",
844 "Hcirc;": "\u0124",
845 "Hfr;": "\u210c",
846 "HilbertSpace;": "\u210b",
847 "Hopf;": "\u210d",
848 "HorizontalLine;": "\u2500",
849 "Hscr;": "\u210b",
850 "Hstrok;": "\u0126",
851 "HumpDownHump;": "\u224e",
852 "HumpEqual;": "\u224f",
853 "IEcy;": "\u0415",
854 "IJlig;": "\u0132",
855 "IOcy;": "\u0401",
856 "Iacute": "\xcd",
857 "Iacute;": "\xcd",
858 "Icirc": "\xce",
859 "Icirc;": "\xce",
860 "Icy;": "\u0418",
861 "Idot;": "\u0130",
862 "Ifr;": "\u2111",
863 "Igrave": "\xcc",
864 "Igrave;": "\xcc",
865 "Im;": "\u2111",
866 "Imacr;": "\u012a",
867 "ImaginaryI;": "\u2148",
868 "Implies;": "\u21d2",
869 "Int;": "\u222c",
870 "Integral;": "\u222b",
871 "Intersection;": "\u22c2",
872 "InvisibleComma;": "\u2063",
873 "InvisibleTimes;": "\u2062",
874 "Iogon;": "\u012e",
875 "Iopf;": "\U0001d540",
876 "Iota;": "\u0399",
877 "Iscr;": "\u2110",
878 "Itilde;": "\u0128",
879 "Iukcy;": "\u0406",
880 "Iuml": "\xcf",
881 "Iuml;": "\xcf",
882 "Jcirc;": "\u0134",
883 "Jcy;": "\u0419",
884 "Jfr;": "\U0001d50d",
885 "Jopf;": "\U0001d541",
886 "Jscr;": "\U0001d4a5",
887 "Jsercy;": "\u0408",
888 "Jukcy;": "\u0404",
889 "KHcy;": "\u0425",
890 "KJcy;": "\u040c",
891 "Kappa;": "\u039a",
892 "Kcedil;": "\u0136",
893 "Kcy;": "\u041a",
894 "Kfr;": "\U0001d50e",
895 "Kopf;": "\U0001d542",
896 "Kscr;": "\U0001d4a6",
897 "LJcy;": "\u0409",
898 "LT": "<",
899 "LT;": "<",
900 "Lacute;": "\u0139",
901 "Lambda;": "\u039b",
902 "Lang;": "\u27ea",
903 "Laplacetrf;": "\u2112",
904 "Larr;": "\u219e",
905 "Lcaron;": "\u013d",
906 "Lcedil;": "\u013b",
907 "Lcy;": "\u041b",
908 "LeftAngleBracket;": "\u27e8",
909 "LeftArrow;": "\u2190",
910 "LeftArrowBar;": "\u21e4",
911 "LeftArrowRightArrow;": "\u21c6",
912 "LeftCeiling;": "\u2308",
913 "LeftDoubleBracket;": "\u27e6",
914 "LeftDownTeeVector;": "\u2961",
915 "LeftDownVector;": "\u21c3",
916 "LeftDownVectorBar;": "\u2959",
917 "LeftFloor;": "\u230a",
918 "LeftRightArrow;": "\u2194",
919 "LeftRightVector;": "\u294e",
920 "LeftTee;": "\u22a3",
921 "LeftTeeArrow;": "\u21a4",
922 "LeftTeeVector;": "\u295a",
923 "LeftTriangle;": "\u22b2",
924 "LeftTriangleBar;": "\u29cf",
925 "LeftTriangleEqual;": "\u22b4",
926 "LeftUpDownVector;": "\u2951",
927 "LeftUpTeeVector;": "\u2960",
928 "LeftUpVector;": "\u21bf",
929 "LeftUpVectorBar;": "\u2958",
930 "LeftVector;": "\u21bc",
931 "LeftVectorBar;": "\u2952",
932 "Leftarrow;": "\u21d0",
933 "Leftrightarrow;": "\u21d4",
934 "LessEqualGreater;": "\u22da",
935 "LessFullEqual;": "\u2266",
936 "LessGreater;": "\u2276",
937 "LessLess;": "\u2aa1",
938 "LessSlantEqual;": "\u2a7d",
939 "LessTilde;": "\u2272",
940 "Lfr;": "\U0001d50f",
941 "Ll;": "\u22d8",
942 "Lleftarrow;": "\u21da",
943 "Lmidot;": "\u013f",
944 "LongLeftArrow;": "\u27f5",
945 "LongLeftRightArrow;": "\u27f7",
946 "LongRightArrow;": "\u27f6",
947 "Longleftarrow;": "\u27f8",
948 "Longleftrightarrow;": "\u27fa",
949 "Longrightarrow;": "\u27f9",
950 "Lopf;": "\U0001d543",
951 "LowerLeftArrow;": "\u2199",
952 "LowerRightArrow;": "\u2198",
953 "Lscr;": "\u2112",
954 "Lsh;": "\u21b0",
955 "Lstrok;": "\u0141",
956 "Lt;": "\u226a",
957 "Map;": "\u2905",
958 "Mcy;": "\u041c",
959 "MediumSpace;": "\u205f",
960 "Mellintrf;": "\u2133",
961 "Mfr;": "\U0001d510",
962 "MinusPlus;": "\u2213",
963 "Mopf;": "\U0001d544",
964 "Mscr;": "\u2133",
965 "Mu;": "\u039c",
966 "NJcy;": "\u040a",
967 "Nacute;": "\u0143",
968 "Ncaron;": "\u0147",
969 "Ncedil;": "\u0145",
970 "Ncy;": "\u041d",
971 "NegativeMediumSpace;": "\u200b",
972 "NegativeThickSpace;": "\u200b",
973 "NegativeThinSpace;": "\u200b",
974 "NegativeVeryThinSpace;": "\u200b",
975 "NestedGreaterGreater;": "\u226b",
976 "NestedLessLess;": "\u226a",
977 "NewLine;": "\n",
978 "Nfr;": "\U0001d511",
979 "NoBreak;": "\u2060",
980 "NonBreakingSpace;": "\xa0",
981 "Nopf;": "\u2115",
982 "Not;": "\u2aec",
983 "NotCongruent;": "\u2262",
984 "NotCupCap;": "\u226d",
985 "NotDoubleVerticalBar;": "\u2226",
986 "NotElement;": "\u2209",
987 "NotEqual;": "\u2260",
988 "NotEqualTilde;": "\u2242\u0338",
989 "NotExists;": "\u2204",
990 "NotGreater;": "\u226f",
991 "NotGreaterEqual;": "\u2271",
992 "NotGreaterFullEqual;": "\u2267\u0338",
993 "NotGreaterGreater;": "\u226b\u0338",
994 "NotGreaterLess;": "\u2279",
995 "NotGreaterSlantEqual;": "\u2a7e\u0338",
996 "NotGreaterTilde;": "\u2275",
997 "NotHumpDownHump;": "\u224e\u0338",
998 "NotHumpEqual;": "\u224f\u0338",
999 "NotLeftTriangle;": "\u22ea",
1000 "NotLeftTriangleBar;": "\u29cf\u0338",
1001 "NotLeftTriangleEqual;": "\u22ec",
1002 "NotLess;": "\u226e",
1003 "NotLessEqual;": "\u2270",
1004 "NotLessGreater;": "\u2278",
1005 "NotLessLess;": "\u226a\u0338",
1006 "NotLessSlantEqual;": "\u2a7d\u0338",
1007 "NotLessTilde;": "\u2274",
1008 "NotNestedGreaterGreater;": "\u2aa2\u0338",
1009 "NotNestedLessLess;": "\u2aa1\u0338",
1010 "NotPrecedes;": "\u2280",
1011 "NotPrecedesEqual;": "\u2aaf\u0338",
1012 "NotPrecedesSlantEqual;": "\u22e0",
1013 "NotReverseElement;": "\u220c",
1014 "NotRightTriangle;": "\u22eb",
1015 "NotRightTriangleBar;": "\u29d0\u0338",
1016 "NotRightTriangleEqual;": "\u22ed",
1017 "NotSquareSubset;": "\u228f\u0338",
1018 "NotSquareSubsetEqual;": "\u22e2",
1019 "NotSquareSuperset;": "\u2290\u0338",
1020 "NotSquareSupersetEqual;": "\u22e3",
1021 "NotSubset;": "\u2282\u20d2",
1022 "NotSubsetEqual;": "\u2288",
1023 "NotSucceeds;": "\u2281",
1024 "NotSucceedsEqual;": "\u2ab0\u0338",
1025 "NotSucceedsSlantEqual;": "\u22e1",
1026 "NotSucceedsTilde;": "\u227f\u0338",
1027 "NotSuperset;": "\u2283\u20d2",
1028 "NotSupersetEqual;": "\u2289",
1029 "NotTilde;": "\u2241",
1030 "NotTildeEqual;": "\u2244",
1031 "NotTildeFullEqual;": "\u2247",
1032 "NotTildeTilde;": "\u2249",
1033 "NotVerticalBar;": "\u2224",
1034 "Nscr;": "\U0001d4a9",
1035 "Ntilde": "\xd1",
1036 "Ntilde;": "\xd1",
1037 "Nu;": "\u039d",
1038 "OElig;": "\u0152",
1039 "Oacute": "\xd3",
1040 "Oacute;": "\xd3",
1041 "Ocirc": "\xd4",
1042 "Ocirc;": "\xd4",
1043 "Ocy;": "\u041e",
1044 "Odblac;": "\u0150",
1045 "Ofr;": "\U0001d512",
1046 "Ograve": "\xd2",
1047 "Ograve;": "\xd2",
1048 "Omacr;": "\u014c",
1049 "Omega;": "\u03a9",
1050 "Omicron;": "\u039f",
1051 "Oopf;": "\U0001d546",
1052 "OpenCurlyDoubleQuote;": "\u201c",
1053 "OpenCurlyQuote;": "\u2018",
1054 "Or;": "\u2a54",
1055 "Oscr;": "\U0001d4aa",
1056 "Oslash": "\xd8",
1057 "Oslash;": "\xd8",
1058 "Otilde": "\xd5",
1059 "Otilde;": "\xd5",
1060 "Otimes;": "\u2a37",
1061 "Ouml": "\xd6",
1062 "Ouml;": "\xd6",
1063 "OverBar;": "\u203e",
1064 "OverBrace;": "\u23de",
1065 "OverBracket;": "\u23b4",
1066 "OverParenthesis;": "\u23dc",
1067 "PartialD;": "\u2202",
1068 "Pcy;": "\u041f",
1069 "Pfr;": "\U0001d513",
1070 "Phi;": "\u03a6",
1071 "Pi;": "\u03a0",
1072 "PlusMinus;": "\xb1",
1073 "Poincareplane;": "\u210c",
1074 "Popf;": "\u2119",
1075 "Pr;": "\u2abb",
1076 "Precedes;": "\u227a",
1077 "PrecedesEqual;": "\u2aaf",
1078 "PrecedesSlantEqual;": "\u227c",
1079 "PrecedesTilde;": "\u227e",
1080 "Prime;": "\u2033",
1081 "Product;": "\u220f",
1082 "Proportion;": "\u2237",
1083 "Proportional;": "\u221d",
1084 "Pscr;": "\U0001d4ab",
1085 "Psi;": "\u03a8",
1086 "QUOT": "\"",
1087 "QUOT;": "\"",
1088 "Qfr;": "\U0001d514",
1089 "Qopf;": "\u211a",
1090 "Qscr;": "\U0001d4ac",
1091 "RBarr;": "\u2910",
1092 "REG": "\xae",
1093 "REG;": "\xae",
1094 "Racute;": "\u0154",
1095 "Rang;": "\u27eb",
1096 "Rarr;": "\u21a0",
1097 "Rarrtl;": "\u2916",
1098 "Rcaron;": "\u0158",
1099 "Rcedil;": "\u0156",
1100 "Rcy;": "\u0420",
1101 "Re;": "\u211c",
1102 "ReverseElement;": "\u220b",
1103 "ReverseEquilibrium;": "\u21cb",
1104 "ReverseUpEquilibrium;": "\u296f",
1105 "Rfr;": "\u211c",
1106 "Rho;": "\u03a1",
1107 "RightAngleBracket;": "\u27e9",
1108 "RightArrow;": "\u2192",
1109 "RightArrowBar;": "\u21e5",
1110 "RightArrowLeftArrow;": "\u21c4",
1111 "RightCeiling;": "\u2309",
1112 "RightDoubleBracket;": "\u27e7",
1113 "RightDownTeeVector;": "\u295d",
1114 "RightDownVector;": "\u21c2",
1115 "RightDownVectorBar;": "\u2955",
1116 "RightFloor;": "\u230b",
1117 "RightTee;": "\u22a2",
1118 "RightTeeArrow;": "\u21a6",
1119 "RightTeeVector;": "\u295b",
1120 "RightTriangle;": "\u22b3",
1121 "RightTriangleBar;": "\u29d0",
1122 "RightTriangleEqual;": "\u22b5",
1123 "RightUpDownVector;": "\u294f",
1124 "RightUpTeeVector;": "\u295c",
1125 "RightUpVector;": "\u21be",
1126 "RightUpVectorBar;": "\u2954",
1127 "RightVector;": "\u21c0",
1128 "RightVectorBar;": "\u2953",
1129 "Rightarrow;": "\u21d2",
1130 "Ropf;": "\u211d",
1131 "RoundImplies;": "\u2970",
1132 "Rrightarrow;": "\u21db",
1133 "Rscr;": "\u211b",
1134 "Rsh;": "\u21b1",
1135 "RuleDelayed;": "\u29f4",
1136 "SHCHcy;": "\u0429",
1137 "SHcy;": "\u0428",
1138 "SOFTcy;": "\u042c",
1139 "Sacute;": "\u015a",
1140 "Sc;": "\u2abc",
1141 "Scaron;": "\u0160",
1142 "Scedil;": "\u015e",
1143 "Scirc;": "\u015c",
1144 "Scy;": "\u0421",
1145 "Sfr;": "\U0001d516",
1146 "ShortDownArrow;": "\u2193",
1147 "ShortLeftArrow;": "\u2190",
1148 "ShortRightArrow;": "\u2192",
1149 "ShortUpArrow;": "\u2191",
1150 "Sigma;": "\u03a3",
1151 "SmallCircle;": "\u2218",
1152 "Sopf;": "\U0001d54a",
1153 "Sqrt;": "\u221a",
1154 "Square;": "\u25a1",
1155 "SquareIntersection;": "\u2293",
1156 "SquareSubset;": "\u228f",
1157 "SquareSubsetEqual;": "\u2291",
1158 "SquareSuperset;": "\u2290",
1159 "SquareSupersetEqual;": "\u2292",
1160 "SquareUnion;": "\u2294",
1161 "Sscr;": "\U0001d4ae",
1162 "Star;": "\u22c6",
1163 "Sub;": "\u22d0",
1164 "Subset;": "\u22d0",
1165 "SubsetEqual;": "\u2286",
1166 "Succeeds;": "\u227b",
1167 "SucceedsEqual;": "\u2ab0",
1168 "SucceedsSlantEqual;": "\u227d",
1169 "SucceedsTilde;": "\u227f",
1170 "SuchThat;": "\u220b",
1171 "Sum;": "\u2211",
1172 "Sup;": "\u22d1",
1173 "Superset;": "\u2283",
1174 "SupersetEqual;": "\u2287",
1175 "Supset;": "\u22d1",
1176 "THORN": "\xde",
1177 "THORN;": "\xde",
1178 "TRADE;": "\u2122",
1179 "TSHcy;": "\u040b",
1180 "TScy;": "\u0426",
1181 "Tab;": "\t",
1182 "Tau;": "\u03a4",
1183 "Tcaron;": "\u0164",
1184 "Tcedil;": "\u0162",
1185 "Tcy;": "\u0422",
1186 "Tfr;": "\U0001d517",
1187 "Therefore;": "\u2234",
1188 "Theta;": "\u0398",
1189 "ThickSpace;": "\u205f\u200a",
1190 "ThinSpace;": "\u2009",
1191 "Tilde;": "\u223c",
1192 "TildeEqual;": "\u2243",
1193 "TildeFullEqual;": "\u2245",
1194 "TildeTilde;": "\u2248",
1195 "Topf;": "\U0001d54b",
1196 "TripleDot;": "\u20db",
1197 "Tscr;": "\U0001d4af",
1198 "Tstrok;": "\u0166",
1199 "Uacute": "\xda",
1200 "Uacute;": "\xda",
1201 "Uarr;": "\u219f",
1202 "Uarrocir;": "\u2949",
1203 "Ubrcy;": "\u040e",
1204 "Ubreve;": "\u016c",
1205 "Ucirc": "\xdb",
1206 "Ucirc;": "\xdb",
1207 "Ucy;": "\u0423",
1208 "Udblac;": "\u0170",
1209 "Ufr;": "\U0001d518",
1210 "Ugrave": "\xd9",
1211 "Ugrave;": "\xd9",
1212 "Umacr;": "\u016a",
1213 "UnderBar;": "_",
1214 "UnderBrace;": "\u23df",
1215 "UnderBracket;": "\u23b5",
1216 "UnderParenthesis;": "\u23dd",
1217 "Union;": "\u22c3",
1218 "UnionPlus;": "\u228e",
1219 "Uogon;": "\u0172",
1220 "Uopf;": "\U0001d54c",
1221 "UpArrow;": "\u2191",
1222 "UpArrowBar;": "\u2912",
1223 "UpArrowDownArrow;": "\u21c5",
1224 "UpDownArrow;": "\u2195",
1225 "UpEquilibrium;": "\u296e",
1226 "UpTee;": "\u22a5",
1227 "UpTeeArrow;": "\u21a5",
1228 "Uparrow;": "\u21d1",
1229 "Updownarrow;": "\u21d5",
1230 "UpperLeftArrow;": "\u2196",
1231 "UpperRightArrow;": "\u2197",
1232 "Upsi;": "\u03d2",
1233 "Upsilon;": "\u03a5",
1234 "Uring;": "\u016e",
1235 "Uscr;": "\U0001d4b0",
1236 "Utilde;": "\u0168",
1237 "Uuml": "\xdc",
1238 "Uuml;": "\xdc",
1239 "VDash;": "\u22ab",
1240 "Vbar;": "\u2aeb",
1241 "Vcy;": "\u0412",
1242 "Vdash;": "\u22a9",
1243 "Vdashl;": "\u2ae6",
1244 "Vee;": "\u22c1",
1245 "Verbar;": "\u2016",
1246 "Vert;": "\u2016",
1247 "VerticalBar;": "\u2223",
1248 "VerticalLine;": "|",
1249 "VerticalSeparator;": "\u2758",
1250 "VerticalTilde;": "\u2240",
1251 "VeryThinSpace;": "\u200a",
1252 "Vfr;": "\U0001d519",
1253 "Vopf;": "\U0001d54d",
1254 "Vscr;": "\U0001d4b1",
1255 "Vvdash;": "\u22aa",
1256 "Wcirc;": "\u0174",
1257 "Wedge;": "\u22c0",
1258 "Wfr;": "\U0001d51a",
1259 "Wopf;": "\U0001d54e",
1260 "Wscr;": "\U0001d4b2",
1261 "Xfr;": "\U0001d51b",
1262 "Xi;": "\u039e",
1263 "Xopf;": "\U0001d54f",
1264 "Xscr;": "\U0001d4b3",
1265 "YAcy;": "\u042f",
1266 "YIcy;": "\u0407",
1267 "YUcy;": "\u042e",
1268 "Yacute": "\xdd",
1269 "Yacute;": "\xdd",
1270 "Ycirc;": "\u0176",
1271 "Ycy;": "\u042b",
1272 "Yfr;": "\U0001d51c",
1273 "Yopf;": "\U0001d550",
1274 "Yscr;": "\U0001d4b4",
1275 "Yuml;": "\u0178",
1276 "ZHcy;": "\u0416",
1277 "Zacute;": "\u0179",
1278 "Zcaron;": "\u017d",
1279 "Zcy;": "\u0417",
1280 "Zdot;": "\u017b",
1281 "ZeroWidthSpace;": "\u200b",
1282 "Zeta;": "\u0396",
1283 "Zfr;": "\u2128",
1284 "Zopf;": "\u2124",
1285 "Zscr;": "\U0001d4b5",
1286 "aacute": "\xe1",
1287 "aacute;": "\xe1",
1288 "abreve;": "\u0103",
1289 "ac;": "\u223e",
1290 "acE;": "\u223e\u0333",
1291 "acd;": "\u223f",
1292 "acirc": "\xe2",
1293 "acirc;": "\xe2",
1294 "acute": "\xb4",
1295 "acute;": "\xb4",
1296 "acy;": "\u0430",
1297 "aelig": "\xe6",
1298 "aelig;": "\xe6",
1299 "af;": "\u2061",
1300 "afr;": "\U0001d51e",
1301 "agrave": "\xe0",
1302 "agrave;": "\xe0",
1303 "alefsym;": "\u2135",
1304 "aleph;": "\u2135",
1305 "alpha;": "\u03b1",
1306 "amacr;": "\u0101",
1307 "amalg;": "\u2a3f",
1308 "amp": "&",
1309 "amp;": "&",
1310 "and;": "\u2227",
1311 "andand;": "\u2a55",
1312 "andd;": "\u2a5c",
1313 "andslope;": "\u2a58",
1314 "andv;": "\u2a5a",
1315 "ang;": "\u2220",
1316 "ange;": "\u29a4",
1317 "angle;": "\u2220",
1318 "angmsd;": "\u2221",
1319 "angmsdaa;": "\u29a8",
1320 "angmsdab;": "\u29a9",
1321 "angmsdac;": "\u29aa",
1322 "angmsdad;": "\u29ab",
1323 "angmsdae;": "\u29ac",
1324 "angmsdaf;": "\u29ad",
1325 "angmsdag;": "\u29ae",
1326 "angmsdah;": "\u29af",
1327 "angrt;": "\u221f",
1328 "angrtvb;": "\u22be",
1329 "angrtvbd;": "\u299d",
1330 "angsph;": "\u2222",
1331 "angst;": "\xc5",
1332 "angzarr;": "\u237c",
1333 "aogon;": "\u0105",
1334 "aopf;": "\U0001d552",
1335 "ap;": "\u2248",
1336 "apE;": "\u2a70",
1337 "apacir;": "\u2a6f",
1338 "ape;": "\u224a",
1339 "apid;": "\u224b",
1340 "apos;": "'",
1341 "approx;": "\u2248",
1342 "approxeq;": "\u224a",
1343 "aring": "\xe5",
1344 "aring;": "\xe5",
1345 "ascr;": "\U0001d4b6",
1346 "ast;": "*",
1347 "asymp;": "\u2248",
1348 "asympeq;": "\u224d",
1349 "atilde": "\xe3",
1350 "atilde;": "\xe3",
1351 "auml": "\xe4",
1352 "auml;": "\xe4",
1353 "awconint;": "\u2233",
1354 "awint;": "\u2a11",
1355 "bNot;": "\u2aed",
1356 "backcong;": "\u224c",
1357 "backepsilon;": "\u03f6",
1358 "backprime;": "\u2035",
1359 "backsim;": "\u223d",
1360 "backsimeq;": "\u22cd",
1361 "barvee;": "\u22bd",
1362 "barwed;": "\u2305",
1363 "barwedge;": "\u2305",
1364 "bbrk;": "\u23b5",
1365 "bbrktbrk;": "\u23b6",
1366 "bcong;": "\u224c",
1367 "bcy;": "\u0431",
1368 "bdquo;": "\u201e",
1369 "becaus;": "\u2235",
1370 "because;": "\u2235",
1371 "bemptyv;": "\u29b0",
1372 "bepsi;": "\u03f6",
1373 "bernou;": "\u212c",
1374 "beta;": "\u03b2",
1375 "beth;": "\u2136",
1376 "between;": "\u226c",
1377 "bfr;": "\U0001d51f",
1378 "bigcap;": "\u22c2",
1379 "bigcirc;": "\u25ef",
1380 "bigcup;": "\u22c3",
1381 "bigodot;": "\u2a00",
1382 "bigoplus;": "\u2a01",
1383 "bigotimes;": "\u2a02",
1384 "bigsqcup;": "\u2a06",
1385 "bigstar;": "\u2605",
1386 "bigtriangledown;": "\u25bd",
1387 "bigtriangleup;": "\u25b3",
1388 "biguplus;": "\u2a04",
1389 "bigvee;": "\u22c1",
1390 "bigwedge;": "\u22c0",
1391 "bkarow;": "\u290d",
1392 "blacklozenge;": "\u29eb",
1393 "blacksquare;": "\u25aa",
1394 "blacktriangle;": "\u25b4",
1395 "blacktriangledown;": "\u25be",
1396 "blacktriangleleft;": "\u25c2",
1397 "blacktriangleright;": "\u25b8",
1398 "blank;": "\u2423",
1399 "blk12;": "\u2592",
1400 "blk14;": "\u2591",
1401 "blk34;": "\u2593",
1402 "block;": "\u2588",
1403 "bne;": "=\u20e5",
1404 "bnequiv;": "\u2261\u20e5",
1405 "bnot;": "\u2310",
1406 "bopf;": "\U0001d553",
1407 "bot;": "\u22a5",
1408 "bottom;": "\u22a5",
1409 "bowtie;": "\u22c8",
1410 "boxDL;": "\u2557",
1411 "boxDR;": "\u2554",
1412 "boxDl;": "\u2556",
1413 "boxDr;": "\u2553",
1414 "boxH;": "\u2550",
1415 "boxHD;": "\u2566",
1416 "boxHU;": "\u2569",
1417 "boxHd;": "\u2564",
1418 "boxHu;": "\u2567",
1419 "boxUL;": "\u255d",
1420 "boxUR;": "\u255a",
1421 "boxUl;": "\u255c",
1422 "boxUr;": "\u2559",
1423 "boxV;": "\u2551",
1424 "boxVH;": "\u256c",
1425 "boxVL;": "\u2563",
1426 "boxVR;": "\u2560",
1427 "boxVh;": "\u256b",
1428 "boxVl;": "\u2562",
1429 "boxVr;": "\u255f",
1430 "boxbox;": "\u29c9",
1431 "boxdL;": "\u2555",
1432 "boxdR;": "\u2552",
1433 "boxdl;": "\u2510",
1434 "boxdr;": "\u250c",
1435 "boxh;": "\u2500",
1436 "boxhD;": "\u2565",
1437 "boxhU;": "\u2568",
1438 "boxhd;": "\u252c",
1439 "boxhu;": "\u2534",
1440 "boxminus;": "\u229f",
1441 "boxplus;": "\u229e",
1442 "boxtimes;": "\u22a0",
1443 "boxuL;": "\u255b",
1444 "boxuR;": "\u2558",
1445 "boxul;": "\u2518",
1446 "boxur;": "\u2514",
1447 "boxv;": "\u2502",
1448 "boxvH;": "\u256a",
1449 "boxvL;": "\u2561",
1450 "boxvR;": "\u255e",
1451 "boxvh;": "\u253c",
1452 "boxvl;": "\u2524",
1453 "boxvr;": "\u251c",
1454 "bprime;": "\u2035",
1455 "breve;": "\u02d8",
1456 "brvbar": "\xa6",
1457 "brvbar;": "\xa6",
1458 "bscr;": "\U0001d4b7",
1459 "bsemi;": "\u204f",
1460 "bsim;": "\u223d",
1461 "bsime;": "\u22cd",
1462 "bsol;": "\\",
1463 "bsolb;": "\u29c5",
1464 "bsolhsub;": "\u27c8",
1465 "bull;": "\u2022",
1466 "bullet;": "\u2022",
1467 "bump;": "\u224e",
1468 "bumpE;": "\u2aae",
1469 "bumpe;": "\u224f",
1470 "bumpeq;": "\u224f",
1471 "cacute;": "\u0107",
1472 "cap;": "\u2229",
1473 "capand;": "\u2a44",
1474 "capbrcup;": "\u2a49",
1475 "capcap;": "\u2a4b",
1476 "capcup;": "\u2a47",
1477 "capdot;": "\u2a40",
1478 "caps;": "\u2229\ufe00",
1479 "caret;": "\u2041",
1480 "caron;": "\u02c7",
1481 "ccaps;": "\u2a4d",
1482 "ccaron;": "\u010d",
1483 "ccedil": "\xe7",
1484 "ccedil;": "\xe7",
1485 "ccirc;": "\u0109",
1486 "ccups;": "\u2a4c",
1487 "ccupssm;": "\u2a50",
1488 "cdot;": "\u010b",
1489 "cedil": "\xb8",
1490 "cedil;": "\xb8",
1491 "cemptyv;": "\u29b2",
1492 "cent": "\xa2",
1493 "cent;": "\xa2",
1494 "centerdot;": "\xb7",
1495 "cfr;": "\U0001d520",
1496 "chcy;": "\u0447",
1497 "check;": "\u2713",
1498 "checkmark;": "\u2713",
1499 "chi;": "\u03c7",
1500 "cir;": "\u25cb",
1501 "cirE;": "\u29c3",
1502 "circ;": "\u02c6",
1503 "circeq;": "\u2257",
1504 "circlearrowleft;": "\u21ba",
1505 "circlearrowright;": "\u21bb",
1506 "circledR;": "\xae",
1507 "circledS;": "\u24c8",
1508 "circledast;": "\u229b",
1509 "circledcirc;": "\u229a",
1510 "circleddash;": "\u229d",
1511 "cire;": "\u2257",
1512 "cirfnint;": "\u2a10",
1513 "cirmid;": "\u2aef",
1514 "cirscir;": "\u29c2",
1515 "clubs;": "\u2663",
1516 "clubsuit;": "\u2663",
1517 "colon;": ":",
1518 "colone;": "\u2254",
1519 "coloneq;": "\u2254",
1520 "comma;": ",",
1521 "commat;": "@",
1522 "comp;": "\u2201",
1523 "compfn;": "\u2218",
1524 "complement;": "\u2201",
1525 "complexes;": "\u2102",
1526 "cong;": "\u2245",
1527 "congdot;": "\u2a6d",
1528 "conint;": "\u222e",
1529 "copf;": "\U0001d554",
1530 "coprod;": "\u2210",
1531 "copy": "\xa9",
1532 "copy;": "\xa9",
1533 "copysr;": "\u2117",
1534 "crarr;": "\u21b5",
1535 "cross;": "\u2717",
1536 "cscr;": "\U0001d4b8",
1537 "csub;": "\u2acf",
1538 "csube;": "\u2ad1",
1539 "csup;": "\u2ad0",
1540 "csupe;": "\u2ad2",
1541 "ctdot;": "\u22ef",
1542 "cudarrl;": "\u2938",
1543 "cudarrr;": "\u2935",
1544 "cuepr;": "\u22de",
1545 "cuesc;": "\u22df",
1546 "cularr;": "\u21b6",
1547 "cularrp;": "\u293d",
1548 "cup;": "\u222a",
1549 "cupbrcap;": "\u2a48",
1550 "cupcap;": "\u2a46",
1551 "cupcup;": "\u2a4a",
1552 "cupdot;": "\u228d",
1553 "cupor;": "\u2a45",
1554 "cups;": "\u222a\ufe00",
1555 "curarr;": "\u21b7",
1556 "curarrm;": "\u293c",
1557 "curlyeqprec;": "\u22de",
1558 "curlyeqsucc;": "\u22df",
1559 "curlyvee;": "\u22ce",
1560 "curlywedge;": "\u22cf",
1561 "curren": "\xa4",
1562 "curren;": "\xa4",
1563 "curvearrowleft;": "\u21b6",
1564 "curvearrowright;": "\u21b7",
1565 "cuvee;": "\u22ce",
1566 "cuwed;": "\u22cf",
1567 "cwconint;": "\u2232",
1568 "cwint;": "\u2231",
1569 "cylcty;": "\u232d",
1570 "dArr;": "\u21d3",
1571 "dHar;": "\u2965",
1572 "dagger;": "\u2020",
1573 "daleth;": "\u2138",
1574 "darr;": "\u2193",
1575 "dash;": "\u2010",
1576 "dashv;": "\u22a3",
1577 "dbkarow;": "\u290f",
1578 "dblac;": "\u02dd",
1579 "dcaron;": "\u010f",
1580 "dcy;": "\u0434",
1581 "dd;": "\u2146",
1582 "ddagger;": "\u2021",
1583 "ddarr;": "\u21ca",
1584 "ddotseq;": "\u2a77",
1585 "deg": "\xb0",
1586 "deg;": "\xb0",
1587 "delta;": "\u03b4",
1588 "demptyv;": "\u29b1",
1589 "dfisht;": "\u297f",
1590 "dfr;": "\U0001d521",
1591 "dharl;": "\u21c3",
1592 "dharr;": "\u21c2",
1593 "diam;": "\u22c4",
1594 "diamond;": "\u22c4",
1595 "diamondsuit;": "\u2666",
1596 "diams;": "\u2666",
1597 "die;": "\xa8",
1598 "digamma;": "\u03dd",
1599 "disin;": "\u22f2",
1600 "div;": "\xf7",
1601 "divide": "\xf7",
1602 "divide;": "\xf7",
1603 "divideontimes;": "\u22c7",
1604 "divonx;": "\u22c7",
1605 "djcy;": "\u0452",
1606 "dlcorn;": "\u231e",
1607 "dlcrop;": "\u230d",
1608 "dollar;": "$",
1609 "dopf;": "\U0001d555",
1610 "dot;": "\u02d9",
1611 "doteq;": "\u2250",
1612 "doteqdot;": "\u2251",
1613 "dotminus;": "\u2238",
1614 "dotplus;": "\u2214",
1615 "dotsquare;": "\u22a1",
1616 "doublebarwedge;": "\u2306",
1617 "downarrow;": "\u2193",
1618 "downdownarrows;": "\u21ca",
1619 "downharpoonleft;": "\u21c3",
1620 "downharpoonright;": "\u21c2",
1621 "drbkarow;": "\u2910",
1622 "drcorn;": "\u231f",
1623 "drcrop;": "\u230c",
1624 "dscr;": "\U0001d4b9",
1625 "dscy;": "\u0455",
1626 "dsol;": "\u29f6",
1627 "dstrok;": "\u0111",
1628 "dtdot;": "\u22f1",
1629 "dtri;": "\u25bf",
1630 "dtrif;": "\u25be",
1631 "duarr;": "\u21f5",
1632 "duhar;": "\u296f",
1633 "dwangle;": "\u29a6",
1634 "dzcy;": "\u045f",
1635 "dzigrarr;": "\u27ff",
1636 "eDDot;": "\u2a77",
1637 "eDot;": "\u2251",
1638 "eacute": "\xe9",
1639 "eacute;": "\xe9",
1640 "easter;": "\u2a6e",
1641 "ecaron;": "\u011b",
1642 "ecir;": "\u2256",
1643 "ecirc": "\xea",
1644 "ecirc;": "\xea",
1645 "ecolon;": "\u2255",
1646 "ecy;": "\u044d",
1647 "edot;": "\u0117",
1648 "ee;": "\u2147",
1649 "efDot;": "\u2252",
1650 "efr;": "\U0001d522",
1651 "eg;": "\u2a9a",
1652 "egrave": "\xe8",
1653 "egrave;": "\xe8",
1654 "egs;": "\u2a96",
1655 "egsdot;": "\u2a98",
1656 "el;": "\u2a99",
1657 "elinters;": "\u23e7",
1658 "ell;": "\u2113",
1659 "els;": "\u2a95",
1660 "elsdot;": "\u2a97",
1661 "emacr;": "\u0113",
1662 "empty;": "\u2205",
1663 "emptyset;": "\u2205",
1664 "emptyv;": "\u2205",
1665 "emsp13;": "\u2004",
1666 "emsp14;": "\u2005",
1667 "emsp;": "\u2003",
1668 "eng;": "\u014b",
1669 "ensp;": "\u2002",
1670 "eogon;": "\u0119",
1671 "eopf;": "\U0001d556",
1672 "epar;": "\u22d5",
1673 "eparsl;": "\u29e3",
1674 "eplus;": "\u2a71",
1675 "epsi;": "\u03b5",
1676 "epsilon;": "\u03b5",
1677 "epsiv;": "\u03f5",
1678 "eqcirc;": "\u2256",
1679 "eqcolon;": "\u2255",
1680 "eqsim;": "\u2242",
1681 "eqslantgtr;": "\u2a96",
1682 "eqslantless;": "\u2a95",
1683 "equals;": "=",
1684 "equest;": "\u225f",
1685 "equiv;": "\u2261",
1686 "equivDD;": "\u2a78",
1687 "eqvparsl;": "\u29e5",
1688 "erDot;": "\u2253",
1689 "erarr;": "\u2971",
1690 "escr;": "\u212f",
1691 "esdot;": "\u2250",
1692 "esim;": "\u2242",
1693 "eta;": "\u03b7",
1694 "eth": "\xf0",
1695 "eth;": "\xf0",
1696 "euml": "\xeb",
1697 "euml;": "\xeb",
1698 "euro;": "\u20ac",
1699 "excl;": "!",
1700 "exist;": "\u2203",
1701 "expectation;": "\u2130",
1702 "exponentiale;": "\u2147",
1703 "fallingdotseq;": "\u2252",
1704 "fcy;": "\u0444",
1705 "female;": "\u2640",
1706 "ffilig;": "\ufb03",
1707 "fflig;": "\ufb00",
1708 "ffllig;": "\ufb04",
1709 "ffr;": "\U0001d523",
1710 "filig;": "\ufb01",
1711 "fjlig;": "fj",
1712 "flat;": "\u266d",
1713 "fllig;": "\ufb02",
1714 "fltns;": "\u25b1",
1715 "fnof;": "\u0192",
1716 "fopf;": "\U0001d557",
1717 "forall;": "\u2200",
1718 "fork;": "\u22d4",
1719 "forkv;": "\u2ad9",
1720 "fpartint;": "\u2a0d",
1721 "frac12": "\xbd",
1722 "frac12;": "\xbd",
1723 "frac13;": "\u2153",
1724 "frac14": "\xbc",
1725 "frac14;": "\xbc",
1726 "frac15;": "\u2155",
1727 "frac16;": "\u2159",
1728 "frac18;": "\u215b",
1729 "frac23;": "\u2154",
1730 "frac25;": "\u2156",
1731 "frac34": "\xbe",
1732 "frac34;": "\xbe",
1733 "frac35;": "\u2157",
1734 "frac38;": "\u215c",
1735 "frac45;": "\u2158",
1736 "frac56;": "\u215a",
1737 "frac58;": "\u215d",
1738 "frac78;": "\u215e",
1739 "frasl;": "\u2044",
1740 "frown;": "\u2322",
1741 "fscr;": "\U0001d4bb",
1742 "gE;": "\u2267",
1743 "gEl;": "\u2a8c",
1744 "gacute;": "\u01f5",
1745 "gamma;": "\u03b3",
1746 "gammad;": "\u03dd",
1747 "gap;": "\u2a86",
1748 "gbreve;": "\u011f",
1749 "gcirc;": "\u011d",
1750 "gcy;": "\u0433",
1751 "gdot;": "\u0121",
1752 "ge;": "\u2265",
1753 "gel;": "\u22db",
1754 "geq;": "\u2265",
1755 "geqq;": "\u2267",
1756 "geqslant;": "\u2a7e",
1757 "ges;": "\u2a7e",
1758 "gescc;": "\u2aa9",
1759 "gesdot;": "\u2a80",
1760 "gesdoto;": "\u2a82",
1761 "gesdotol;": "\u2a84",
1762 "gesl;": "\u22db\ufe00",
1763 "gesles;": "\u2a94",
1764 "gfr;": "\U0001d524",
1765 "gg;": "\u226b",
1766 "ggg;": "\u22d9",
1767 "gimel;": "\u2137",
1768 "gjcy;": "\u0453",
1769 "gl;": "\u2277",
1770 "glE;": "\u2a92",
1771 "gla;": "\u2aa5",
1772 "glj;": "\u2aa4",
1773 "gnE;": "\u2269",
1774 "gnap;": "\u2a8a",
1775 "gnapprox;": "\u2a8a",
1776 "gne;": "\u2a88",
1777 "gneq;": "\u2a88",
1778 "gneqq;": "\u2269",
1779 "gnsim;": "\u22e7",
1780 "gopf;": "\U0001d558",
1781 "grave;": "`",
1782 "gscr;": "\u210a",
1783 "gsim;": "\u2273",
1784 "gsime;": "\u2a8e",
1785 "gsiml;": "\u2a90",
1786 "gt": ">",
1787 "gt;": ">",
1788 "gtcc;": "\u2aa7",
1789 "gtcir;": "\u2a7a",
1790 "gtdot;": "\u22d7",
1791 "gtlPar;": "\u2995",
1792 "gtquest;": "\u2a7c",
1793 "gtrapprox;": "\u2a86",
1794 "gtrarr;": "\u2978",
1795 "gtrdot;": "\u22d7",
1796 "gtreqless;": "\u22db",
1797 "gtreqqless;": "\u2a8c",
1798 "gtrless;": "\u2277",
1799 "gtrsim;": "\u2273",
1800 "gvertneqq;": "\u2269\ufe00",
1801 "gvnE;": "\u2269\ufe00",
1802 "hArr;": "\u21d4",
1803 "hairsp;": "\u200a",
1804 "half;": "\xbd",
1805 "hamilt;": "\u210b",
1806 "hardcy;": "\u044a",
1807 "harr;": "\u2194",
1808 "harrcir;": "\u2948",
1809 "harrw;": "\u21ad",
1810 "hbar;": "\u210f",
1811 "hcirc;": "\u0125",
1812 "hearts;": "\u2665",
1813 "heartsuit;": "\u2665",
1814 "hellip;": "\u2026",
1815 "hercon;": "\u22b9",
1816 "hfr;": "\U0001d525",
1817 "hksearow;": "\u2925",
1818 "hkswarow;": "\u2926",
1819 "hoarr;": "\u21ff",
1820 "homtht;": "\u223b",
1821 "hookleftarrow;": "\u21a9",
1822 "hookrightarrow;": "\u21aa",
1823 "hopf;": "\U0001d559",
1824 "horbar;": "\u2015",
1825 "hscr;": "\U0001d4bd",
1826 "hslash;": "\u210f",
1827 "hstrok;": "\u0127",
1828 "hybull;": "\u2043",
1829 "hyphen;": "\u2010",
1830 "iacute": "\xed",
1831 "iacute;": "\xed",
1832 "ic;": "\u2063",
1833 "icirc": "\xee",
1834 "icirc;": "\xee",
1835 "icy;": "\u0438",
1836 "iecy;": "\u0435",
1837 "iexcl": "\xa1",
1838 "iexcl;": "\xa1",
1839 "iff;": "\u21d4",
1840 "ifr;": "\U0001d526",
1841 "igrave": "\xec",
1842 "igrave;": "\xec",
1843 "ii;": "\u2148",
1844 "iiiint;": "\u2a0c",
1845 "iiint;": "\u222d",
1846 "iinfin;": "\u29dc",
1847 "iiota;": "\u2129",
1848 "ijlig;": "\u0133",
1849 "imacr;": "\u012b",
1850 "image;": "\u2111",
1851 "imagline;": "\u2110",
1852 "imagpart;": "\u2111",
1853 "imath;": "\u0131",
1854 "imof;": "\u22b7",
1855 "imped;": "\u01b5",
1856 "in;": "\u2208",
1857 "incare;": "\u2105",
1858 "infin;": "\u221e",
1859 "infintie;": "\u29dd",
1860 "inodot;": "\u0131",
1861 "int;": "\u222b",
1862 "intcal;": "\u22ba",
1863 "integers;": "\u2124",
1864 "intercal;": "\u22ba",
1865 "intlarhk;": "\u2a17",
1866 "intprod;": "\u2a3c",
1867 "iocy;": "\u0451",
1868 "iogon;": "\u012f",
1869 "iopf;": "\U0001d55a",
1870 "iota;": "\u03b9",
1871 "iprod;": "\u2a3c",
1872 "iquest": "\xbf",
1873 "iquest;": "\xbf",
1874 "iscr;": "\U0001d4be",
1875 "isin;": "\u2208",
1876 "isinE;": "\u22f9",
1877 "isindot;": "\u22f5",
1878 "isins;": "\u22f4",
1879 "isinsv;": "\u22f3",
1880 "isinv;": "\u2208",
1881 "it;": "\u2062",
1882 "itilde;": "\u0129",
1883 "iukcy;": "\u0456",
1884 "iuml": "\xef",
1885 "iuml;": "\xef",
1886 "jcirc;": "\u0135",
1887 "jcy;": "\u0439",
1888 "jfr;": "\U0001d527",
1889 "jmath;": "\u0237",
1890 "jopf;": "\U0001d55b",
1891 "jscr;": "\U0001d4bf",
1892 "jsercy;": "\u0458",
1893 "jukcy;": "\u0454",
1894 "kappa;": "\u03ba",
1895 "kappav;": "\u03f0",
1896 "kcedil;": "\u0137",
1897 "kcy;": "\u043a",
1898 "kfr;": "\U0001d528",
1899 "kgreen;": "\u0138",
1900 "khcy;": "\u0445",
1901 "kjcy;": "\u045c",
1902 "kopf;": "\U0001d55c",
1903 "kscr;": "\U0001d4c0",
1904 "lAarr;": "\u21da",
1905 "lArr;": "\u21d0",
1906 "lAtail;": "\u291b",
1907 "lBarr;": "\u290e",
1908 "lE;": "\u2266",
1909 "lEg;": "\u2a8b",
1910 "lHar;": "\u2962",
1911 "lacute;": "\u013a",
1912 "laemptyv;": "\u29b4",
1913 "lagran;": "\u2112",
1914 "lambda;": "\u03bb",
1915 "lang;": "\u27e8",
1916 "langd;": "\u2991",
1917 "langle;": "\u27e8",
1918 "lap;": "\u2a85",
1919 "laquo": "\xab",
1920 "laquo;": "\xab",
1921 "larr;": "\u2190",
1922 "larrb;": "\u21e4",
1923 "larrbfs;": "\u291f",
1924 "larrfs;": "\u291d",
1925 "larrhk;": "\u21a9",
1926 "larrlp;": "\u21ab",
1927 "larrpl;": "\u2939",
1928 "larrsim;": "\u2973",
1929 "larrtl;": "\u21a2",
1930 "lat;": "\u2aab",
1931 "latail;": "\u2919",
1932 "late;": "\u2aad",
1933 "lates;": "\u2aad\ufe00",
1934 "lbarr;": "\u290c",
1935 "lbbrk;": "\u2772",
1936 "lbrace;": "{",
1937 "lbrack;": "[",
1938 "lbrke;": "\u298b",
1939 "lbrksld;": "\u298f",
1940 "lbrkslu;": "\u298d",
1941 "lcaron;": "\u013e",
1942 "lcedil;": "\u013c",
1943 "lceil;": "\u2308",
1944 "lcub;": "{",
1945 "lcy;": "\u043b",
1946 "ldca;": "\u2936",
1947 "ldquo;": "\u201c",
1948 "ldquor;": "\u201e",
1949 "ldrdhar;": "\u2967",
1950 "ldrushar;": "\u294b",
1951 "ldsh;": "\u21b2",
1952 "le;": "\u2264",
1953 "leftarrow;": "\u2190",
1954 "leftarrowtail;": "\u21a2",
1955 "leftharpoondown;": "\u21bd",
1956 "leftharpoonup;": "\u21bc",
1957 "leftleftarrows;": "\u21c7",
1958 "leftrightarrow;": "\u2194",
1959 "leftrightarrows;": "\u21c6",
1960 "leftrightharpoons;": "\u21cb",
1961 "leftrightsquigarrow;": "\u21ad",
1962 "leftthreetimes;": "\u22cb",
1963 "leg;": "\u22da",
1964 "leq;": "\u2264",
1965 "leqq;": "\u2266",
1966 "leqslant;": "\u2a7d",
1967 "les;": "\u2a7d",
1968 "lescc;": "\u2aa8",
1969 "lesdot;": "\u2a7f",
1970 "lesdoto;": "\u2a81",
1971 "lesdotor;": "\u2a83",
1972 "lesg;": "\u22da\ufe00",
1973 "lesges;": "\u2a93",
1974 "lessapprox;": "\u2a85",
1975 "lessdot;": "\u22d6",
1976 "lesseqgtr;": "\u22da",
1977 "lesseqqgtr;": "\u2a8b",
1978 "lessgtr;": "\u2276",
1979 "lesssim;": "\u2272",
1980 "lfisht;": "\u297c",
1981 "lfloor;": "\u230a",
1982 "lfr;": "\U0001d529",
1983 "lg;": "\u2276",
1984 "lgE;": "\u2a91",
1985 "lhard;": "\u21bd",
1986 "lharu;": "\u21bc",
1987 "lharul;": "\u296a",
1988 "lhblk;": "\u2584",
1989 "ljcy;": "\u0459",
1990 "ll;": "\u226a",
1991 "llarr;": "\u21c7",
1992 "llcorner;": "\u231e",
1993 "llhard;": "\u296b",
1994 "lltri;": "\u25fa",
1995 "lmidot;": "\u0140",
1996 "lmoust;": "\u23b0",
1997 "lmoustache;": "\u23b0",
1998 "lnE;": "\u2268",
1999 "lnap;": "\u2a89",
2000 "lnapprox;": "\u2a89",
2001 "lne;": "\u2a87",
2002 "lneq;": "\u2a87",
2003 "lneqq;": "\u2268",
2004 "lnsim;": "\u22e6",
2005 "loang;": "\u27ec",
2006 "loarr;": "\u21fd",
2007 "lobrk;": "\u27e6",
2008 "longleftarrow;": "\u27f5",
2009 "longleftrightarrow;": "\u27f7",
2010 "longmapsto;": "\u27fc",
2011 "longrightarrow;": "\u27f6",
2012 "looparrowleft;": "\u21ab",
2013 "looparrowright;": "\u21ac",
2014 "lopar;": "\u2985",
2015 "lopf;": "\U0001d55d",
2016 "loplus;": "\u2a2d",
2017 "lotimes;": "\u2a34",
2018 "lowast;": "\u2217",
2019 "lowbar;": "_",
2020 "loz;": "\u25ca",
2021 "lozenge;": "\u25ca",
2022 "lozf;": "\u29eb",
2023 "lpar;": "(",
2024 "lparlt;": "\u2993",
2025 "lrarr;": "\u21c6",
2026 "lrcorner;": "\u231f",
2027 "lrhar;": "\u21cb",
2028 "lrhard;": "\u296d",
2029 "lrm;": "\u200e",
2030 "lrtri;": "\u22bf",
2031 "lsaquo;": "\u2039",
2032 "lscr;": "\U0001d4c1",
2033 "lsh;": "\u21b0",
2034 "lsim;": "\u2272",
2035 "lsime;": "\u2a8d",
2036 "lsimg;": "\u2a8f",
2037 "lsqb;": "[",
2038 "lsquo;": "\u2018",
2039 "lsquor;": "\u201a",
2040 "lstrok;": "\u0142",
2041 "lt": "<",
2042 "lt;": "<",
2043 "ltcc;": "\u2aa6",
2044 "ltcir;": "\u2a79",
2045 "ltdot;": "\u22d6",
2046 "lthree;": "\u22cb",
2047 "ltimes;": "\u22c9",
2048 "ltlarr;": "\u2976",
2049 "ltquest;": "\u2a7b",
2050 "ltrPar;": "\u2996",
2051 "ltri;": "\u25c3",
2052 "ltrie;": "\u22b4",
2053 "ltrif;": "\u25c2",
2054 "lurdshar;": "\u294a",
2055 "luruhar;": "\u2966",
2056 "lvertneqq;": "\u2268\ufe00",
2057 "lvnE;": "\u2268\ufe00",
2058 "mDDot;": "\u223a",
2059 "macr": "\xaf",
2060 "macr;": "\xaf",
2061 "male;": "\u2642",
2062 "malt;": "\u2720",
2063 "maltese;": "\u2720",
2064 "map;": "\u21a6",
2065 "mapsto;": "\u21a6",
2066 "mapstodown;": "\u21a7",
2067 "mapstoleft;": "\u21a4",
2068 "mapstoup;": "\u21a5",
2069 "marker;": "\u25ae",
2070 "mcomma;": "\u2a29",
2071 "mcy;": "\u043c",
2072 "mdash;": "\u2014",
2073 "measuredangle;": "\u2221",
2074 "mfr;": "\U0001d52a",
2075 "mho;": "\u2127",
2076 "micro": "\xb5",
2077 "micro;": "\xb5",
2078 "mid;": "\u2223",
2079 "midast;": "*",
2080 "midcir;": "\u2af0",
2081 "middot": "\xb7",
2082 "middot;": "\xb7",
2083 "minus;": "\u2212",
2084 "minusb;": "\u229f",
2085 "minusd;": "\u2238",
2086 "minusdu;": "\u2a2a",
2087 "mlcp;": "\u2adb",
2088 "mldr;": "\u2026",
2089 "mnplus;": "\u2213",
2090 "models;": "\u22a7",
2091 "mopf;": "\U0001d55e",
2092 "mp;": "\u2213",
2093 "mscr;": "\U0001d4c2",
2094 "mstpos;": "\u223e",
2095 "mu;": "\u03bc",
2096 "multimap;": "\u22b8",
2097 "mumap;": "\u22b8",
2098 "nGg;": "\u22d9\u0338",
2099 "nGt;": "\u226b\u20d2",
2100 "nGtv;": "\u226b\u0338",
2101 "nLeftarrow;": "\u21cd",
2102 "nLeftrightarrow;": "\u21ce",
2103 "nLl;": "\u22d8\u0338",
2104 "nLt;": "\u226a\u20d2",
2105 "nLtv;": "\u226a\u0338",
2106 "nRightarrow;": "\u21cf",
2107 "nVDash;": "\u22af",
2108 "nVdash;": "\u22ae",
2109 "nabla;": "\u2207",
2110 "nacute;": "\u0144",
2111 "nang;": "\u2220\u20d2",
2112 "nap;": "\u2249",
2113 "napE;": "\u2a70\u0338",
2114 "napid;": "\u224b\u0338",
2115 "napos;": "\u0149",
2116 "napprox;": "\u2249",
2117 "natur;": "\u266e",
2118 "natural;": "\u266e",
2119 "naturals;": "\u2115",
2120 "nbsp": "\xa0",
2121 "nbsp;": "\xa0",
2122 "nbump;": "\u224e\u0338",
2123 "nbumpe;": "\u224f\u0338",
2124 "ncap;": "\u2a43",
2125 "ncaron;": "\u0148",
2126 "ncedil;": "\u0146",
2127 "ncong;": "\u2247",
2128 "ncongdot;": "\u2a6d\u0338",
2129 "ncup;": "\u2a42",
2130 "ncy;": "\u043d",
2131 "ndash;": "\u2013",
2132 "ne;": "\u2260",
2133 "neArr;": "\u21d7",
2134 "nearhk;": "\u2924",
2135 "nearr;": "\u2197",
2136 "nearrow;": "\u2197",
2137 "nedot;": "\u2250\u0338",
2138 "nequiv;": "\u2262",
2139 "nesear;": "\u2928",
2140 "nesim;": "\u2242\u0338",
2141 "nexist;": "\u2204",
2142 "nexists;": "\u2204",
2143 "nfr;": "\U0001d52b",
2144 "ngE;": "\u2267\u0338",
2145 "nge;": "\u2271",
2146 "ngeq;": "\u2271",
2147 "ngeqq;": "\u2267\u0338",
2148 "ngeqslant;": "\u2a7e\u0338",
2149 "nges;": "\u2a7e\u0338",
2150 "ngsim;": "\u2275",
2151 "ngt;": "\u226f",
2152 "ngtr;": "\u226f",
2153 "nhArr;": "\u21ce",
2154 "nharr;": "\u21ae",
2155 "nhpar;": "\u2af2",
2156 "ni;": "\u220b",
2157 "nis;": "\u22fc",
2158 "nisd;": "\u22fa",
2159 "niv;": "\u220b",
2160 "njcy;": "\u045a",
2161 "nlArr;": "\u21cd",
2162 "nlE;": "\u2266\u0338",
2163 "nlarr;": "\u219a",
2164 "nldr;": "\u2025",
2165 "nle;": "\u2270",
2166 "nleftarrow;": "\u219a",
2167 "nleftrightarrow;": "\u21ae",
2168 "nleq;": "\u2270",
2169 "nleqq;": "\u2266\u0338",
2170 "nleqslant;": "\u2a7d\u0338",
2171 "nles;": "\u2a7d\u0338",
2172 "nless;": "\u226e",
2173 "nlsim;": "\u2274",
2174 "nlt;": "\u226e",
2175 "nltri;": "\u22ea",
2176 "nltrie;": "\u22ec",
2177 "nmid;": "\u2224",
2178 "nopf;": "\U0001d55f",
2179 "not": "\xac",
2180 "not;": "\xac",
2181 "notin;": "\u2209",
2182 "notinE;": "\u22f9\u0338",
2183 "notindot;": "\u22f5\u0338",
2184 "notinva;": "\u2209",
2185 "notinvb;": "\u22f7",
2186 "notinvc;": "\u22f6",
2187 "notni;": "\u220c",
2188 "notniva;": "\u220c",
2189 "notnivb;": "\u22fe",
2190 "notnivc;": "\u22fd",
2191 "npar;": "\u2226",
2192 "nparallel;": "\u2226",
2193 "nparsl;": "\u2afd\u20e5",
2194 "npart;": "\u2202\u0338",
2195 "npolint;": "\u2a14",
2196 "npr;": "\u2280",
2197 "nprcue;": "\u22e0",
2198 "npre;": "\u2aaf\u0338",
2199 "nprec;": "\u2280",
2200 "npreceq;": "\u2aaf\u0338",
2201 "nrArr;": "\u21cf",
2202 "nrarr;": "\u219b",
2203 "nrarrc;": "\u2933\u0338",
2204 "nrarrw;": "\u219d\u0338",
2205 "nrightarrow;": "\u219b",
2206 "nrtri;": "\u22eb",
2207 "nrtrie;": "\u22ed",
2208 "nsc;": "\u2281",
2209 "nsccue;": "\u22e1",
2210 "nsce;": "\u2ab0\u0338",
2211 "nscr;": "\U0001d4c3",
2212 "nshortmid;": "\u2224",
2213 "nshortparallel;": "\u2226",
2214 "nsim;": "\u2241",
2215 "nsime;": "\u2244",
2216 "nsimeq;": "\u2244",
2217 "nsmid;": "\u2224",
2218 "nspar;": "\u2226",
2219 "nsqsube;": "\u22e2",
2220 "nsqsupe;": "\u22e3",
2221 "nsub;": "\u2284",
2222 "nsubE;": "\u2ac5\u0338",
2223 "nsube;": "\u2288",
2224 "nsubset;": "\u2282\u20d2",
2225 "nsubseteq;": "\u2288",
2226 "nsubseteqq;": "\u2ac5\u0338",
2227 "nsucc;": "\u2281",
2228 "nsucceq;": "\u2ab0\u0338",
2229 "nsup;": "\u2285",
2230 "nsupE;": "\u2ac6\u0338",
2231 "nsupe;": "\u2289",
2232 "nsupset;": "\u2283\u20d2",
2233 "nsupseteq;": "\u2289",
2234 "nsupseteqq;": "\u2ac6\u0338",
2235 "ntgl;": "\u2279",
2236 "ntilde": "\xf1",
2237 "ntilde;": "\xf1",
2238 "ntlg;": "\u2278",
2239 "ntriangleleft;": "\u22ea",
2240 "ntrianglelefteq;": "\u22ec",
2241 "ntriangleright;": "\u22eb",
2242 "ntrianglerighteq;": "\u22ed",
2243 "nu;": "\u03bd",
2244 "num;": "#",
2245 "numero;": "\u2116",
2246 "numsp;": "\u2007",
2247 "nvDash;": "\u22ad",
2248 "nvHarr;": "\u2904",
2249 "nvap;": "\u224d\u20d2",
2250 "nvdash;": "\u22ac",
2251 "nvge;": "\u2265\u20d2",
2252 "nvgt;": ">\u20d2",
2253 "nvinfin;": "\u29de",
2254 "nvlArr;": "\u2902",
2255 "nvle;": "\u2264\u20d2",
2256 "nvlt;": "<\u20d2",
2257 "nvltrie;": "\u22b4\u20d2",
2258 "nvrArr;": "\u2903",
2259 "nvrtrie;": "\u22b5\u20d2",
2260 "nvsim;": "\u223c\u20d2",
2261 "nwArr;": "\u21d6",
2262 "nwarhk;": "\u2923",
2263 "nwarr;": "\u2196",
2264 "nwarrow;": "\u2196",
2265 "nwnear;": "\u2927",
2266 "oS;": "\u24c8",
2267 "oacute": "\xf3",
2268 "oacute;": "\xf3",
2269 "oast;": "\u229b",
2270 "ocir;": "\u229a",
2271 "ocirc": "\xf4",
2272 "ocirc;": "\xf4",
2273 "ocy;": "\u043e",
2274 "odash;": "\u229d",
2275 "odblac;": "\u0151",
2276 "odiv;": "\u2a38",
2277 "odot;": "\u2299",
2278 "odsold;": "\u29bc",
2279 "oelig;": "\u0153",
2280 "ofcir;": "\u29bf",
2281 "ofr;": "\U0001d52c",
2282 "ogon;": "\u02db",
2283 "ograve": "\xf2",
2284 "ograve;": "\xf2",
2285 "ogt;": "\u29c1",
2286 "ohbar;": "\u29b5",
2287 "ohm;": "\u03a9",
2288 "oint;": "\u222e",
2289 "olarr;": "\u21ba",
2290 "olcir;": "\u29be",
2291 "olcross;": "\u29bb",
2292 "oline;": "\u203e",
2293 "olt;": "\u29c0",
2294 "omacr;": "\u014d",
2295 "omega;": "\u03c9",
2296 "omicron;": "\u03bf",
2297 "omid;": "\u29b6",
2298 "ominus;": "\u2296",
2299 "oopf;": "\U0001d560",
2300 "opar;": "\u29b7",
2301 "operp;": "\u29b9",
2302 "oplus;": "\u2295",
2303 "or;": "\u2228",
2304 "orarr;": "\u21bb",
2305 "ord;": "\u2a5d",
2306 "order;": "\u2134",
2307 "orderof;": "\u2134",
2308 "ordf": "\xaa",
2309 "ordf;": "\xaa",
2310 "ordm": "\xba",
2311 "ordm;": "\xba",
2312 "origof;": "\u22b6",
2313 "oror;": "\u2a56",
2314 "orslope;": "\u2a57",
2315 "orv;": "\u2a5b",
2316 "oscr;": "\u2134",
2317 "oslash": "\xf8",
2318 "oslash;": "\xf8",
2319 "osol;": "\u2298",
2320 "otilde": "\xf5",
2321 "otilde;": "\xf5",
2322 "otimes;": "\u2297",
2323 "otimesas;": "\u2a36",
2324 "ouml": "\xf6",
2325 "ouml;": "\xf6",
2326 "ovbar;": "\u233d",
2327 "par;": "\u2225",
2328 "para": "\xb6",
2329 "para;": "\xb6",
2330 "parallel;": "\u2225",
2331 "parsim;": "\u2af3",
2332 "parsl;": "\u2afd",
2333 "part;": "\u2202",
2334 "pcy;": "\u043f",
2335 "percnt;": "%",
2336 "period;": ".",
2337 "permil;": "\u2030",
2338 "perp;": "\u22a5",
2339 "pertenk;": "\u2031",
2340 "pfr;": "\U0001d52d",
2341 "phi;": "\u03c6",
2342 "phiv;": "\u03d5",
2343 "phmmat;": "\u2133",
2344 "phone;": "\u260e",
2345 "pi;": "\u03c0",
2346 "pitchfork;": "\u22d4",
2347 "piv;": "\u03d6",
2348 "planck;": "\u210f",
2349 "planckh;": "\u210e",
2350 "plankv;": "\u210f",
2351 "plus;": "+",
2352 "plusacir;": "\u2a23",
2353 "plusb;": "\u229e",
2354 "pluscir;": "\u2a22",
2355 "plusdo;": "\u2214",
2356 "plusdu;": "\u2a25",
2357 "pluse;": "\u2a72",
2358 "plusmn": "\xb1",
2359 "plusmn;": "\xb1",
2360 "plussim;": "\u2a26",
2361 "plustwo;": "\u2a27",
2362 "pm;": "\xb1",
2363 "pointint;": "\u2a15",
2364 "popf;": "\U0001d561",
2365 "pound": "\xa3",
2366 "pound;": "\xa3",
2367 "pr;": "\u227a",
2368 "prE;": "\u2ab3",
2369 "prap;": "\u2ab7",
2370 "prcue;": "\u227c",
2371 "pre;": "\u2aaf",
2372 "prec;": "\u227a",
2373 "precapprox;": "\u2ab7",
2374 "preccurlyeq;": "\u227c",
2375 "preceq;": "\u2aaf",
2376 "precnapprox;": "\u2ab9",
2377 "precneqq;": "\u2ab5",
2378 "precnsim;": "\u22e8",
2379 "precsim;": "\u227e",
2380 "prime;": "\u2032",
2381 "primes;": "\u2119",
2382 "prnE;": "\u2ab5",
2383 "prnap;": "\u2ab9",
2384 "prnsim;": "\u22e8",
2385 "prod;": "\u220f",
2386 "profalar;": "\u232e",
2387 "profline;": "\u2312",
2388 "profsurf;": "\u2313",
2389 "prop;": "\u221d",
2390 "propto;": "\u221d",
2391 "prsim;": "\u227e",
2392 "prurel;": "\u22b0",
2393 "pscr;": "\U0001d4c5",
2394 "psi;": "\u03c8",
2395 "puncsp;": "\u2008",
2396 "qfr;": "\U0001d52e",
2397 "qint;": "\u2a0c",
2398 "qopf;": "\U0001d562",
2399 "qprime;": "\u2057",
2400 "qscr;": "\U0001d4c6",
2401 "quaternions;": "\u210d",
2402 "quatint;": "\u2a16",
2403 "quest;": "?",
2404 "questeq;": "\u225f",
2405 "quot": "\"",
2406 "quot;": "\"",
2407 "rAarr;": "\u21db",
2408 "rArr;": "\u21d2",
2409 "rAtail;": "\u291c",
2410 "rBarr;": "\u290f",
2411 "rHar;": "\u2964",
2412 "race;": "\u223d\u0331",
2413 "racute;": "\u0155",
2414 "radic;": "\u221a",
2415 "raemptyv;": "\u29b3",
2416 "rang;": "\u27e9",
2417 "rangd;": "\u2992",
2418 "range;": "\u29a5",
2419 "rangle;": "\u27e9",
2420 "raquo": "\xbb",
2421 "raquo;": "\xbb",
2422 "rarr;": "\u2192",
2423 "rarrap;": "\u2975",
2424 "rarrb;": "\u21e5",
2425 "rarrbfs;": "\u2920",
2426 "rarrc;": "\u2933",
2427 "rarrfs;": "\u291e",
2428 "rarrhk;": "\u21aa",
2429 "rarrlp;": "\u21ac",
2430 "rarrpl;": "\u2945",
2431 "rarrsim;": "\u2974",
2432 "rarrtl;": "\u21a3",
2433 "rarrw;": "\u219d",
2434 "ratail;": "\u291a",
2435 "ratio;": "\u2236",
2436 "rationals;": "\u211a",
2437 "rbarr;": "\u290d",
2438 "rbbrk;": "\u2773",
2439 "rbrace;": "}",
2440 "rbrack;": "]",
2441 "rbrke;": "\u298c",
2442 "rbrksld;": "\u298e",
2443 "rbrkslu;": "\u2990",
2444 "rcaron;": "\u0159",
2445 "rcedil;": "\u0157",
2446 "rceil;": "\u2309",
2447 "rcub;": "}",
2448 "rcy;": "\u0440",
2449 "rdca;": "\u2937",
2450 "rdldhar;": "\u2969",
2451 "rdquo;": "\u201d",
2452 "rdquor;": "\u201d",
2453 "rdsh;": "\u21b3",
2454 "real;": "\u211c",
2455 "realine;": "\u211b",
2456 "realpart;": "\u211c",
2457 "reals;": "\u211d",
2458 "rect;": "\u25ad",
2459 "reg": "\xae",
2460 "reg;": "\xae",
2461 "rfisht;": "\u297d",
2462 "rfloor;": "\u230b",
2463 "rfr;": "\U0001d52f",
2464 "rhard;": "\u21c1",
2465 "rharu;": "\u21c0",
2466 "rharul;": "\u296c",
2467 "rho;": "\u03c1",
2468 "rhov;": "\u03f1",
2469 "rightarrow;": "\u2192",
2470 "rightarrowtail;": "\u21a3",
2471 "rightharpoondown;": "\u21c1",
2472 "rightharpoonup;": "\u21c0",
2473 "rightleftarrows;": "\u21c4",
2474 "rightleftharpoons;": "\u21cc",
2475 "rightrightarrows;": "\u21c9",
2476 "rightsquigarrow;": "\u219d",
2477 "rightthreetimes;": "\u22cc",
2478 "ring;": "\u02da",
2479 "risingdotseq;": "\u2253",
2480 "rlarr;": "\u21c4",
2481 "rlhar;": "\u21cc",
2482 "rlm;": "\u200f",
2483 "rmoust;": "\u23b1",
2484 "rmoustache;": "\u23b1",
2485 "rnmid;": "\u2aee",
2486 "roang;": "\u27ed",
2487 "roarr;": "\u21fe",
2488 "robrk;": "\u27e7",
2489 "ropar;": "\u2986",
2490 "ropf;": "\U0001d563",
2491 "roplus;": "\u2a2e",
2492 "rotimes;": "\u2a35",
2493 "rpar;": ")",
2494 "rpargt;": "\u2994",
2495 "rppolint;": "\u2a12",
2496 "rrarr;": "\u21c9",
2497 "rsaquo;": "\u203a",
2498 "rscr;": "\U0001d4c7",
2499 "rsh;": "\u21b1",
2500 "rsqb;": "]",
2501 "rsquo;": "\u2019",
2502 "rsquor;": "\u2019",
2503 "rthree;": "\u22cc",
2504 "rtimes;": "\u22ca",
2505 "rtri;": "\u25b9",
2506 "rtrie;": "\u22b5",
2507 "rtrif;": "\u25b8",
2508 "rtriltri;": "\u29ce",
2509 "ruluhar;": "\u2968",
2510 "rx;": "\u211e",
2511 "sacute;": "\u015b",
2512 "sbquo;": "\u201a",
2513 "sc;": "\u227b",
2514 "scE;": "\u2ab4",
2515 "scap;": "\u2ab8",
2516 "scaron;": "\u0161",
2517 "sccue;": "\u227d",
2518 "sce;": "\u2ab0",
2519 "scedil;": "\u015f",
2520 "scirc;": "\u015d",
2521 "scnE;": "\u2ab6",
2522 "scnap;": "\u2aba",
2523 "scnsim;": "\u22e9",
2524 "scpolint;": "\u2a13",
2525 "scsim;": "\u227f",
2526 "scy;": "\u0441",
2527 "sdot;": "\u22c5",
2528 "sdotb;": "\u22a1",
2529 "sdote;": "\u2a66",
2530 "seArr;": "\u21d8",
2531 "searhk;": "\u2925",
2532 "searr;": "\u2198",
2533 "searrow;": "\u2198",
2534 "sect": "\xa7",
2535 "sect;": "\xa7",
2536 "semi;": ";",
2537 "seswar;": "\u2929",
2538 "setminus;": "\u2216",
2539 "setmn;": "\u2216",
2540 "sext;": "\u2736",
2541 "sfr;": "\U0001d530",
2542 "sfrown;": "\u2322",
2543 "sharp;": "\u266f",
2544 "shchcy;": "\u0449",
2545 "shcy;": "\u0448",
2546 "shortmid;": "\u2223",
2547 "shortparallel;": "\u2225",
2548 "shy": "\xad",
2549 "shy;": "\xad",
2550 "sigma;": "\u03c3",
2551 "sigmaf;": "\u03c2",
2552 "sigmav;": "\u03c2",
2553 "sim;": "\u223c",
2554 "simdot;": "\u2a6a",
2555 "sime;": "\u2243",
2556 "simeq;": "\u2243",
2557 "simg;": "\u2a9e",
2558 "simgE;": "\u2aa0",
2559 "siml;": "\u2a9d",
2560 "simlE;": "\u2a9f",
2561 "simne;": "\u2246",
2562 "simplus;": "\u2a24",
2563 "simrarr;": "\u2972",
2564 "slarr;": "\u2190",
2565 "smallsetminus;": "\u2216",
2566 "smashp;": "\u2a33",
2567 "smeparsl;": "\u29e4",
2568 "smid;": "\u2223",
2569 "smile;": "\u2323",
2570 "smt;": "\u2aaa",
2571 "smte;": "\u2aac",
2572 "smtes;": "\u2aac\ufe00",
2573 "softcy;": "\u044c",
2574 "sol;": "/",
2575 "solb;": "\u29c4",
2576 "solbar;": "\u233f",
2577 "sopf;": "\U0001d564",
2578 "spades;": "\u2660",
2579 "spadesuit;": "\u2660",
2580 "spar;": "\u2225",
2581 "sqcap;": "\u2293",
2582 "sqcaps;": "\u2293\ufe00",
2583 "sqcup;": "\u2294",
2584 "sqcups;": "\u2294\ufe00",
2585 "sqsub;": "\u228f",
2586 "sqsube;": "\u2291",
2587 "sqsubset;": "\u228f",
2588 "sqsubseteq;": "\u2291",
2589 "sqsup;": "\u2290",
2590 "sqsupe;": "\u2292",
2591 "sqsupset;": "\u2290",
2592 "sqsupseteq;": "\u2292",
2593 "squ;": "\u25a1",
2594 "square;": "\u25a1",
2595 "squarf;": "\u25aa",
2596 "squf;": "\u25aa",
2597 "srarr;": "\u2192",
2598 "sscr;": "\U0001d4c8",
2599 "ssetmn;": "\u2216",
2600 "ssmile;": "\u2323",
2601 "sstarf;": "\u22c6",
2602 "star;": "\u2606",
2603 "starf;": "\u2605",
2604 "straightepsilon;": "\u03f5",
2605 "straightphi;": "\u03d5",
2606 "strns;": "\xaf",
2607 "sub;": "\u2282",
2608 "subE;": "\u2ac5",
2609 "subdot;": "\u2abd",
2610 "sube;": "\u2286",
2611 "subedot;": "\u2ac3",
2612 "submult;": "\u2ac1",
2613 "subnE;": "\u2acb",
2614 "subne;": "\u228a",
2615 "subplus;": "\u2abf",
2616 "subrarr;": "\u2979",
2617 "subset;": "\u2282",
2618 "subseteq;": "\u2286",
2619 "subseteqq;": "\u2ac5",
2620 "subsetneq;": "\u228a",
2621 "subsetneqq;": "\u2acb",
2622 "subsim;": "\u2ac7",
2623 "subsub;": "\u2ad5",
2624 "subsup;": "\u2ad3",
2625 "succ;": "\u227b",
2626 "succapprox;": "\u2ab8",
2627 "succcurlyeq;": "\u227d",
2628 "succeq;": "\u2ab0",
2629 "succnapprox;": "\u2aba",
2630 "succneqq;": "\u2ab6",
2631 "succnsim;": "\u22e9",
2632 "succsim;": "\u227f",
2633 "sum;": "\u2211",
2634 "sung;": "\u266a",
2635 "sup1": "\xb9",
2636 "sup1;": "\xb9",
2637 "sup2": "\xb2",
2638 "sup2;": "\xb2",
2639 "sup3": "\xb3",
2640 "sup3;": "\xb3",
2641 "sup;": "\u2283",
2642 "supE;": "\u2ac6",
2643 "supdot;": "\u2abe",
2644 "supdsub;": "\u2ad8",
2645 "supe;": "\u2287",
2646 "supedot;": "\u2ac4",
2647 "suphsol;": "\u27c9",
2648 "suphsub;": "\u2ad7",
2649 "suplarr;": "\u297b",
2650 "supmult;": "\u2ac2",
2651 "supnE;": "\u2acc",
2652 "supne;": "\u228b",
2653 "supplus;": "\u2ac0",
2654 "supset;": "\u2283",
2655 "supseteq;": "\u2287",
2656 "supseteqq;": "\u2ac6",
2657 "supsetneq;": "\u228b",
2658 "supsetneqq;": "\u2acc",
2659 "supsim;": "\u2ac8",
2660 "supsub;": "\u2ad4",
2661 "supsup;": "\u2ad6",
2662 "swArr;": "\u21d9",
2663 "swarhk;": "\u2926",
2664 "swarr;": "\u2199",
2665 "swarrow;": "\u2199",
2666 "swnwar;": "\u292a",
2667 "szlig": "\xdf",
2668 "szlig;": "\xdf",
2669 "target;": "\u2316",
2670 "tau;": "\u03c4",
2671 "tbrk;": "\u23b4",
2672 "tcaron;": "\u0165",
2673 "tcedil;": "\u0163",
2674 "tcy;": "\u0442",
2675 "tdot;": "\u20db",
2676 "telrec;": "\u2315",
2677 "tfr;": "\U0001d531",
2678 "there4;": "\u2234",
2679 "therefore;": "\u2234",
2680 "theta;": "\u03b8",
2681 "thetasym;": "\u03d1",
2682 "thetav;": "\u03d1",
2683 "thickapprox;": "\u2248",
2684 "thicksim;": "\u223c",
2685 "thinsp;": "\u2009",
2686 "thkap;": "\u2248",
2687 "thksim;": "\u223c",
2688 "thorn": "\xfe",
2689 "thorn;": "\xfe",
2690 "tilde;": "\u02dc",
2691 "times": "\xd7",
2692 "times;": "\xd7",
2693 "timesb;": "\u22a0",
2694 "timesbar;": "\u2a31",
2695 "timesd;": "\u2a30",
2696 "tint;": "\u222d",
2697 "toea;": "\u2928",
2698 "top;": "\u22a4",
2699 "topbot;": "\u2336",
2700 "topcir;": "\u2af1",
2701 "topf;": "\U0001d565",
2702 "topfork;": "\u2ada",
2703 "tosa;": "\u2929",
2704 "tprime;": "\u2034",
2705 "trade;": "\u2122",
2706 "triangle;": "\u25b5",
2707 "triangledown;": "\u25bf",
2708 "triangleleft;": "\u25c3",
2709 "trianglelefteq;": "\u22b4",
2710 "triangleq;": "\u225c",
2711 "triangleright;": "\u25b9",
2712 "trianglerighteq;": "\u22b5",
2713 "tridot;": "\u25ec",
2714 "trie;": "\u225c",
2715 "triminus;": "\u2a3a",
2716 "triplus;": "\u2a39",
2717 "trisb;": "\u29cd",
2718 "tritime;": "\u2a3b",
2719 "trpezium;": "\u23e2",
2720 "tscr;": "\U0001d4c9",
2721 "tscy;": "\u0446",
2722 "tshcy;": "\u045b",
2723 "tstrok;": "\u0167",
2724 "twixt;": "\u226c",
2725 "twoheadleftarrow;": "\u219e",
2726 "twoheadrightarrow;": "\u21a0",
2727 "uArr;": "\u21d1",
2728 "uHar;": "\u2963",
2729 "uacute": "\xfa",
2730 "uacute;": "\xfa",
2731 "uarr;": "\u2191",
2732 "ubrcy;": "\u045e",
2733 "ubreve;": "\u016d",
2734 "ucirc": "\xfb",
2735 "ucirc;": "\xfb",
2736 "ucy;": "\u0443",
2737 "udarr;": "\u21c5",
2738 "udblac;": "\u0171",
2739 "udhar;": "\u296e",
2740 "ufisht;": "\u297e",
2741 "ufr;": "\U0001d532",
2742 "ugrave": "\xf9",
2743 "ugrave;": "\xf9",
2744 "uharl;": "\u21bf",
2745 "uharr;": "\u21be",
2746 "uhblk;": "\u2580",
2747 "ulcorn;": "\u231c",
2748 "ulcorner;": "\u231c",
2749 "ulcrop;": "\u230f",
2750 "ultri;": "\u25f8",
2751 "umacr;": "\u016b",
2752 "uml": "\xa8",
2753 "uml;": "\xa8",
2754 "uogon;": "\u0173",
2755 "uopf;": "\U0001d566",
2756 "uparrow;": "\u2191",
2757 "updownarrow;": "\u2195",
2758 "upharpoonleft;": "\u21bf",
2759 "upharpoonright;": "\u21be",
2760 "uplus;": "\u228e",
2761 "upsi;": "\u03c5",
2762 "upsih;": "\u03d2",
2763 "upsilon;": "\u03c5",
2764 "upuparrows;": "\u21c8",
2765 "urcorn;": "\u231d",
2766 "urcorner;": "\u231d",
2767 "urcrop;": "\u230e",
2768 "uring;": "\u016f",
2769 "urtri;": "\u25f9",
2770 "uscr;": "\U0001d4ca",
2771 "utdot;": "\u22f0",
2772 "utilde;": "\u0169",
2773 "utri;": "\u25b5",
2774 "utrif;": "\u25b4",
2775 "uuarr;": "\u21c8",
2776 "uuml": "\xfc",
2777 "uuml;": "\xfc",
2778 "uwangle;": "\u29a7",
2779 "vArr;": "\u21d5",
2780 "vBar;": "\u2ae8",
2781 "vBarv;": "\u2ae9",
2782 "vDash;": "\u22a8",
2783 "vangrt;": "\u299c",
2784 "varepsilon;": "\u03f5",
2785 "varkappa;": "\u03f0",
2786 "varnothing;": "\u2205",
2787 "varphi;": "\u03d5",
2788 "varpi;": "\u03d6",
2789 "varpropto;": "\u221d",
2790 "varr;": "\u2195",
2791 "varrho;": "\u03f1",
2792 "varsigma;": "\u03c2",
2793 "varsubsetneq;": "\u228a\ufe00",
2794 "varsubsetneqq;": "\u2acb\ufe00",
2795 "varsupsetneq;": "\u228b\ufe00",
2796 "varsupsetneqq;": "\u2acc\ufe00",
2797 "vartheta;": "\u03d1",
2798 "vartriangleleft;": "\u22b2",
2799 "vartriangleright;": "\u22b3",
2800 "vcy;": "\u0432",
2801 "vdash;": "\u22a2",
2802 "vee;": "\u2228",
2803 "veebar;": "\u22bb",
2804 "veeeq;": "\u225a",
2805 "vellip;": "\u22ee",
2806 "verbar;": "|",
2807 "vert;": "|",
2808 "vfr;": "\U0001d533",
2809 "vltri;": "\u22b2",
2810 "vnsub;": "\u2282\u20d2",
2811 "vnsup;": "\u2283\u20d2",
2812 "vopf;": "\U0001d567",
2813 "vprop;": "\u221d",
2814 "vrtri;": "\u22b3",
2815 "vscr;": "\U0001d4cb",
2816 "vsubnE;": "\u2acb\ufe00",
2817 "vsubne;": "\u228a\ufe00",
2818 "vsupnE;": "\u2acc\ufe00",
2819 "vsupne;": "\u228b\ufe00",
2820 "vzigzag;": "\u299a",
2821 "wcirc;": "\u0175",
2822 "wedbar;": "\u2a5f",
2823 "wedge;": "\u2227",
2824 "wedgeq;": "\u2259",
2825 "weierp;": "\u2118",
2826 "wfr;": "\U0001d534",
2827 "wopf;": "\U0001d568",
2828 "wp;": "\u2118",
2829 "wr;": "\u2240",
2830 "wreath;": "\u2240",
2831 "wscr;": "\U0001d4cc",
2832 "xcap;": "\u22c2",
2833 "xcirc;": "\u25ef",
2834 "xcup;": "\u22c3",
2835 "xdtri;": "\u25bd",
2836 "xfr;": "\U0001d535",
2837 "xhArr;": "\u27fa",
2838 "xharr;": "\u27f7",
2839 "xi;": "\u03be",
2840 "xlArr;": "\u27f8",
2841 "xlarr;": "\u27f5",
2842 "xmap;": "\u27fc",
2843 "xnis;": "\u22fb",
2844 "xodot;": "\u2a00",
2845 "xopf;": "\U0001d569",
2846 "xoplus;": "\u2a01",
2847 "xotime;": "\u2a02",
2848 "xrArr;": "\u27f9",
2849 "xrarr;": "\u27f6",
2850 "xscr;": "\U0001d4cd",
2851 "xsqcup;": "\u2a06",
2852 "xuplus;": "\u2a04",
2853 "xutri;": "\u25b3",
2854 "xvee;": "\u22c1",
2855 "xwedge;": "\u22c0",
2856 "yacute": "\xfd",
2857 "yacute;": "\xfd",
2858 "yacy;": "\u044f",
2859 "ycirc;": "\u0177",
2860 "ycy;": "\u044b",
2861 "yen": "\xa5",
2862 "yen;": "\xa5",
2863 "yfr;": "\U0001d536",
2864 "yicy;": "\u0457",
2865 "yopf;": "\U0001d56a",
2866 "yscr;": "\U0001d4ce",
2867 "yucy;": "\u044e",
2868 "yuml": "\xff",
2869 "yuml;": "\xff",
2870 "zacute;": "\u017a",
2871 "zcaron;": "\u017e",
2872 "zcy;": "\u0437",
2873 "zdot;": "\u017c",
2874 "zeetrf;": "\u2128",
2875 "zeta;": "\u03b6",
2876 "zfr;": "\U0001d537",
2877 "zhcy;": "\u0436",
2878 "zigrarr;": "\u21dd",
2879 "zopf;": "\U0001d56b",
2880 "zscr;": "\U0001d4cf",
2881 "zwj;": "\u200d",
2882 "zwnj;": "\u200c",
2883}
2884
2885replacementCharacters = {
2886 0x0: "\uFFFD",
2887 0x0d: "\u000D",
2888 0x80: "\u20AC",
2889 0x81: "\u0081",
2890 0x82: "\u201A",
2891 0x83: "\u0192",
2892 0x84: "\u201E",
2893 0x85: "\u2026",
2894 0x86: "\u2020",
2895 0x87: "\u2021",
2896 0x88: "\u02C6",
2897 0x89: "\u2030",
2898 0x8A: "\u0160",
2899 0x8B: "\u2039",
2900 0x8C: "\u0152",
2901 0x8D: "\u008D",
2902 0x8E: "\u017D",
2903 0x8F: "\u008F",
2904 0x90: "\u0090",
2905 0x91: "\u2018",
2906 0x92: "\u2019",
2907 0x93: "\u201C",
2908 0x94: "\u201D",
2909 0x95: "\u2022",
2910 0x96: "\u2013",
2911 0x97: "\u2014",
2912 0x98: "\u02DC",
2913 0x99: "\u2122",
2914 0x9A: "\u0161",
2915 0x9B: "\u203A",
2916 0x9C: "\u0153",
2917 0x9D: "\u009D",
2918 0x9E: "\u017E",
2919 0x9F: "\u0178",
2920}
2921
2922tokenTypes = {
2923 "Doctype": 0,
2924 "Characters": 1,
2925 "SpaceCharacters": 2,
2926 "StartTag": 3,
2927 "EndTag": 4,
2928 "EmptyTag": 5,
2929 "Comment": 6,
2930 "ParseError": 7
2931}
2932
2933tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
2934 tokenTypes["EmptyTag"]])
2935
2936
2937prefixes = dict([(v, k) for k, v in namespaces.items()])
2938prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
2939
2940
2941class DataLossWarning(UserWarning):
2942 """Raised when the current tree is unable to represent the input data"""
2943 pass
2944
2945
2946class _ReparseException(Exception):
2947 pass
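Editor's note: a minimal usage sketch for the tables that close out constants.py above. It assumes the vendored import path shown in the diff and that the entity table above is the module's `entities` mapping (that name is not visible in this hunk); not taken from the diff itself.

    from pip._vendor.html5lib.constants import (
        entities, replacementCharacters, tokenTypes, tagTokenTypes, prefixes)

    # `entities` is assumed to be the named-character-reference table ending above,
    # keyed both with and without the trailing semicolon.
    assert entities["copy;"] == "\xa9"
    assert entities["copy"] == "\xa9"

    # Windows-1252 fix-ups applied to stray C1 code points.
    assert replacementCharacters[0x80] == "\u20ac"

    # Integer token-type codes and the subset that denotes tag tokens.
    assert tokenTypes["StartTag"] in tagTokenTypes

    # prefixes inverts the namespaces mapping (URI -> short prefix).
    assert prefixes["http://www.w3.org/1998/Math/MathML"] == "math"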
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/__init__.py
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/alphabeticalattributes.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/alphabeticalattributes.py
new file mode 100644
index 0000000..d9e234a
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/alphabeticalattributes.py
@@ -0,0 +1,29 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from . import base
4
5from collections import OrderedDict
6
7
8def _attr_key(attr):
9 """Return an appropriate key for an attribute for sorting
10
11 Attributes have a namespace that can be either ``None`` or a string. We
12 can't compare the two because they're different types, so we convert
13 ``None`` to an empty string first.
14
15 """
16 return (attr[0][0] or ''), attr[0][1]
17
18
19class Filter(base.Filter):
20 """Alphabetizes attributes for elements"""
21 def __iter__(self):
22 for token in base.Filter.__iter__(self):
23 if token["type"] in ("StartTag", "EmptyTag"):
24 attrs = OrderedDict()
25 for name, value in sorted(token["data"].items(),
26 key=_attr_key):
27 attrs[name] = value
28 token["data"] = attrs
29 yield token
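Editor's note: a usage sketch for the attribute-sorting filter above, assuming the standard html5lib entry points (parse, getTreeWalker, serializer.HTMLSerializer) under the vendored import path; names and output are illustrative, not taken from this diff.

    import pip._vendor.html5lib as html5lib
    from pip._vendor.html5lib import getTreeWalker, serializer
    from pip._vendor.html5lib.filters.alphabeticalattributes import Filter as AlphaAttrs

    dom = html5lib.parse('<img src="a.png" id="pic" alt="a">')
    stream = AlphaAttrs(getTreeWalker("etree")(dom))
    # Attributes come out of the serializer in sorted order, e.g.
    # <img alt=a id=pic src=a.png>
    print(serializer.HTMLSerializer().render(stream))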
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/base.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/base.py
new file mode 100644
index 0000000..f5aa523
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/base.py
@@ -0,0 +1,12 @@
1from __future__ import absolute_import, division, unicode_literals
2
3
4class Filter(object):
5 def __init__(self, source):
6 self.source = source
7
8 def __iter__(self):
9 return iter(self.source)
10
11 def __getattr__(self, name):
12 return getattr(self.source, name)
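Editor's note: base.Filter is the shared wrapper that the other filters in this package subclass; a custom filter only needs to override __iter__, and every other attribute is proxied to the wrapped source. A small illustrative sketch (DropComments is a made-up name, not part of html5lib):

    from pip._vendor.html5lib.filters import base

    class DropComments(base.Filter):
        """Pass tokens through unchanged, skipping Comment tokens."""
        def __iter__(self):
            for token in base.Filter.__iter__(self):
                if token["type"] != "Comment":
                    yield token

    tokens = [{"type": "Comment", "data": "x"},
              {"type": "Characters", "data": "hi"}]
    assert [t["type"] for t in DropComments(tokens)] == ["Characters"]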
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/inject_meta_charset.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/inject_meta_charset.py
new file mode 100644
index 0000000..2f8ec4f
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/inject_meta_charset.py
@@ -0,0 +1,73 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from . import base
4
5
6class Filter(base.Filter):
7 """Injects ``<meta charset=ENCODING>`` tag into head of document"""
8 def __init__(self, source, encoding):
9 """Creates a Filter
10
11 :arg source: the source token stream
12
13 :arg encoding: the encoding to set
14
15 """
16 base.Filter.__init__(self, source)
17 self.encoding = encoding
18
19 def __iter__(self):
20 state = "pre_head"
21 meta_found = (self.encoding is None)
22 pending = []
23
24 for token in base.Filter.__iter__(self):
25 type = token["type"]
26 if type == "StartTag":
27 if token["name"].lower() == "head":
28 state = "in_head"
29
30 elif type == "EmptyTag":
31 if token["name"].lower() == "meta":
32 # replace charset with actual encoding
33 has_http_equiv_content_type = False
34 for (namespace, name), value in token["data"].items():
35 if namespace is not None:
36 continue
37 elif name.lower() == 'charset':
38 token["data"][(namespace, name)] = self.encoding
39 meta_found = True
40 break
41 elif name == 'http-equiv' and value.lower() == 'content-type':
42 has_http_equiv_content_type = True
43 else:
44 if has_http_equiv_content_type and (None, "content") in token["data"]:
45 token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
46 meta_found = True
47
48 elif token["name"].lower() == "head" and not meta_found:
49 # insert meta into empty head
50 yield {"type": "StartTag", "name": "head",
51 "data": token["data"]}
52 yield {"type": "EmptyTag", "name": "meta",
53 "data": {(None, "charset"): self.encoding}}
54 yield {"type": "EndTag", "name": "head"}
55 meta_found = True
56 continue
57
58 elif type == "EndTag":
59 if token["name"].lower() == "head" and pending:
60 # insert meta into head (if necessary) and flush pending queue
61 yield pending.pop(0)
62 if not meta_found:
63 yield {"type": "EmptyTag", "name": "meta",
64 "data": {(None, "charset"): self.encoding}}
65 while pending:
66 yield pending.pop(0)
67 meta_found = True
68 state = "post_head"
69
70 if state == "in_head":
71 pending.append(token)
72 else:
73 yield token
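Editor's note: a usage sketch for the charset-injection filter above, assuming the standard html5lib entry points; the serializer also exposes an inject_meta_charset option that wraps the stream in this filter when an output encoding is requested.

    import pip._vendor.html5lib as html5lib
    from pip._vendor.html5lib import getTreeWalker, serializer
    from pip._vendor.html5lib.filters.inject_meta_charset import Filter as MetaCharset

    dom = html5lib.parse("<title>t</title>")
    stream = MetaCharset(getTreeWalker("etree")(dom), "utf-8")
    out = serializer.HTMLSerializer(omit_optional_tags=False).render(stream)
    # out now contains <meta charset=utf-8> inside <head>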
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/lint.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/lint.py
new file mode 100644
index 0000000..b5bbd97
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/lint.py
@@ -0,0 +1,93 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from pip._vendor.six import text_type
4
5from . import base
6from ..constants import namespaces, voidElements
7
8from ..constants import spaceCharacters
9spaceCharacters = "".join(spaceCharacters)
10
11
12class Filter(base.Filter):
13 """Lints the token stream for errors
14
15 If it finds any errors, it'll raise an ``AssertionError``.
16
17 """
18 def __init__(self, source, require_matching_tags=True):
19 """Creates a Filter
20
21 :arg source: the source token stream
22
23 :arg require_matching_tags: whether or not to require matching tags
24
25 """
26 super(Filter, self).__init__(source)
27 self.require_matching_tags = require_matching_tags
28
29 def __iter__(self):
30 open_elements = []
31 for token in base.Filter.__iter__(self):
32 type = token["type"]
33 if type in ("StartTag", "EmptyTag"):
34 namespace = token["namespace"]
35 name = token["name"]
36 assert namespace is None or isinstance(namespace, text_type)
37 assert namespace != ""
38 assert isinstance(name, text_type)
39 assert name != ""
40 assert isinstance(token["data"], dict)
41 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
42 assert type == "EmptyTag"
43 else:
44 assert type == "StartTag"
45 if type == "StartTag" and self.require_matching_tags:
46 open_elements.append((namespace, name))
47 for (namespace, name), value in token["data"].items():
48 assert namespace is None or isinstance(namespace, text_type)
49 assert namespace != ""
50 assert isinstance(name, text_type)
51 assert name != ""
52 assert isinstance(value, text_type)
53
54 elif type == "EndTag":
55 namespace = token["namespace"]
56 name = token["name"]
57 assert namespace is None or isinstance(namespace, text_type)
58 assert namespace != ""
59 assert isinstance(name, text_type)
60 assert name != ""
61 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
62 assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
63 elif self.require_matching_tags:
64 start = open_elements.pop()
65 assert start == (namespace, name)
66
67 elif type == "Comment":
68 data = token["data"]
69 assert isinstance(data, text_type)
70
71 elif type in ("Characters", "SpaceCharacters"):
72 data = token["data"]
73 assert isinstance(data, text_type)
74 assert data != ""
75 if type == "SpaceCharacters":
76 assert data.strip(spaceCharacters) == ""
77
78 elif type == "Doctype":
79 name = token["name"]
80 assert name is None or isinstance(name, text_type)
81            assert token["publicId"] is None or isinstance(token["publicId"], text_type)
82            assert token["systemId"] is None or isinstance(token["systemId"], text_type)
83
84 elif type == "Entity":
85 assert isinstance(token["name"], text_type)
86
87 elif type == "SerializerError":
88 assert isinstance(token["data"], text_type)
89
90 else:
91 assert False, "Unknown token type: %(type)s" % {"type": type}
92
93 yield token
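Editor's note: the lint filter is a debugging aid; wrapping any token stream with it either passes the tokens through unchanged or raises AssertionError on the first malformed token. A minimal sketch using hand-built token dicts of the shapes checked above:

    from pip._vendor.html5lib.filters.lint import Filter as Lint

    good = [{"type": "StartTag", "name": "p", "namespace": None, "data": {}},
            {"type": "Characters", "data": "ok"},
            {"type": "EndTag", "name": "p", "namespace": None, "data": {}}]
    assert [t["type"] for t in Lint(good)] == ["StartTag", "Characters", "EndTag"]

    # This stream would fail the lint: br is a void element, so it must be
    # reported as an EmptyTag token rather than a StartTag.
    bad = [{"type": "StartTag", "name": "br", "namespace": None, "data": {}}]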
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/optionaltags.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/optionaltags.py
new file mode 100644
index 0000000..c8d5e54
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/optionaltags.py
@@ -0,0 +1,207 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from . import base
4
5
6class Filter(base.Filter):
7 """Removes optional tags from the token stream"""
8 def slider(self):
9 previous1 = previous2 = None
10 for token in self.source:
11 if previous1 is not None:
12 yield previous2, previous1, token
13 previous2 = previous1
14 previous1 = token
15 if previous1 is not None:
16 yield previous2, previous1, None
17
18 def __iter__(self):
19 for previous, token, next in self.slider():
20 type = token["type"]
21 if type == "StartTag":
22 if (token["data"] or
23 not self.is_optional_start(token["name"], previous, next)):
24 yield token
25 elif type == "EndTag":
26 if not self.is_optional_end(token["name"], next):
27 yield token
28 else:
29 yield token
30
31 def is_optional_start(self, tagname, previous, next):
32 type = next and next["type"] or None
33 if tagname in 'html':
34 # An html element's start tag may be omitted if the first thing
35 # inside the html element is not a space character or a comment.
36 return type not in ("Comment", "SpaceCharacters")
37 elif tagname == 'head':
38 # A head element's start tag may be omitted if the first thing
39 # inside the head element is an element.
40 # XXX: we also omit the start tag if the head element is empty
41 if type in ("StartTag", "EmptyTag"):
42 return True
43 elif type == "EndTag":
44 return next["name"] == "head"
45 elif tagname == 'body':
46 # A body element's start tag may be omitted if the first thing
47 # inside the body element is not a space character or a comment,
48 # except if the first thing inside the body element is a script
49 # or style element and the node immediately preceding the body
50 # element is a head element whose end tag has been omitted.
51 if type in ("Comment", "SpaceCharacters"):
52 return False
53 elif type == "StartTag":
54 # XXX: we do not look at the preceding event, so we never omit
55 # the body element's start tag if it's followed by a script or
56 # a style element.
57 return next["name"] not in ('script', 'style')
58 else:
59 return True
60 elif tagname == 'colgroup':
61 # A colgroup element's start tag may be omitted if the first thing
62 # inside the colgroup element is a col element, and if the element
63 # is not immediately preceded by another colgroup element whose
64 # end tag has been omitted.
65 if type in ("StartTag", "EmptyTag"):
66 # XXX: we do not look at the preceding event, so instead we never
67 # omit the colgroup element's end tag when it is immediately
68 # followed by another colgroup element. See is_optional_end.
69 return next["name"] == "col"
70 else:
71 return False
72 elif tagname == 'tbody':
73 # A tbody element's start tag may be omitted if the first thing
74 # inside the tbody element is a tr element, and if the element is
75 # not immediately preceded by a tbody, thead, or tfoot element
76 # whose end tag has been omitted.
77 if type == "StartTag":
78 # omit the thead and tfoot elements' end tag when they are
79 # immediately followed by a tbody element. See is_optional_end.
80 if previous and previous['type'] == 'EndTag' and \
81 previous['name'] in ('tbody', 'thead', 'tfoot'):
82 return False
83 return next["name"] == 'tr'
84 else:
85 return False
86 return False
87
88 def is_optional_end(self, tagname, next):
89 type = next and next["type"] or None
90 if tagname in ('html', 'head', 'body'):
91 # An html element's end tag may be omitted if the html element
92 # is not immediately followed by a space character or a comment.
93 return type not in ("Comment", "SpaceCharacters")
94 elif tagname in ('li', 'optgroup', 'tr'):
95 # A li element's end tag may be omitted if the li element is
96 # immediately followed by another li element or if there is
97 # no more content in the parent element.
98 # An optgroup element's end tag may be omitted if the optgroup
99 # element is immediately followed by another optgroup element,
100 # or if there is no more content in the parent element.
101 # A tr element's end tag may be omitted if the tr element is
102 # immediately followed by another tr element, or if there is
103 # no more content in the parent element.
104 if type == "StartTag":
105 return next["name"] == tagname
106 else:
107 return type == "EndTag" or type is None
108 elif tagname in ('dt', 'dd'):
109 # A dt element's end tag may be omitted if the dt element is
110 # immediately followed by another dt element or a dd element.
111 # A dd element's end tag may be omitted if the dd element is
112 # immediately followed by another dd element or a dt element,
113 # or if there is no more content in the parent element.
114 if type == "StartTag":
115 return next["name"] in ('dt', 'dd')
116 elif tagname == 'dd':
117 return type == "EndTag" or type is None
118 else:
119 return False
120 elif tagname == 'p':
121 # A p element's end tag may be omitted if the p element is
122 # immediately followed by an address, article, aside,
123 # blockquote, datagrid, dialog, dir, div, dl, fieldset,
124 # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
125            # nav, ol, p, pre, section, table, or ul element, or if
126 # there is no more content in the parent element.
127 if type in ("StartTag", "EmptyTag"):
128 return next["name"] in ('address', 'article', 'aside',
129 'blockquote', 'datagrid', 'dialog',
130 'dir', 'div', 'dl', 'fieldset', 'footer',
131 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
132 'header', 'hr', 'menu', 'nav', 'ol',
133 'p', 'pre', 'section', 'table', 'ul')
134 else:
135 return type == "EndTag" or type is None
136 elif tagname == 'option':
137 # An option element's end tag may be omitted if the option
138 # element is immediately followed by another option element,
139            # or if it is immediately followed by an optgroup
140 # element, or if there is no more content in the parent
141 # element.
142 if type == "StartTag":
143 return next["name"] in ('option', 'optgroup')
144 else:
145 return type == "EndTag" or type is None
146 elif tagname in ('rt', 'rp'):
147 # An rt element's end tag may be omitted if the rt element is
148 # immediately followed by an rt or rp element, or if there is
149 # no more content in the parent element.
150 # An rp element's end tag may be omitted if the rp element is
151 # immediately followed by an rt or rp element, or if there is
152 # no more content in the parent element.
153 if type == "StartTag":
154 return next["name"] in ('rt', 'rp')
155 else:
156 return type == "EndTag" or type is None
157 elif tagname == 'colgroup':
158 # A colgroup element's end tag may be omitted if the colgroup
159 # element is not immediately followed by a space character or
160 # a comment.
161 if type in ("Comment", "SpaceCharacters"):
162 return False
163 elif type == "StartTag":
164 # XXX: we also look for an immediately following colgroup
165 # element. See is_optional_start.
166 return next["name"] != 'colgroup'
167 else:
168 return True
169 elif tagname in ('thead', 'tbody'):
170 # A thead element's end tag may be omitted if the thead element
171 # is immediately followed by a tbody or tfoot element.
172 # A tbody element's end tag may be omitted if the tbody element
173 # is immediately followed by a tbody or tfoot element, or if
174 # there is no more content in the parent element.
175 # A tfoot element's end tag may be omitted if the tfoot element
176 # is immediately followed by a tbody element, or if there is no
177 # more content in the parent element.
178 # XXX: we never omit the end tag when the following element is
179 # a tbody. See is_optional_start.
180 if type == "StartTag":
181 return next["name"] in ['tbody', 'tfoot']
182 elif tagname == 'tbody':
183 return type == "EndTag" or type is None
184 else:
185 return False
186 elif tagname == 'tfoot':
187 # A tfoot element's end tag may be omitted if the tfoot element
188 # is immediately followed by a tbody element, or if there is no
189 # more content in the parent element.
190 # XXX: we never omit the end tag when the following element is
191 # a tbody. See is_optional_start.
192 if type == "StartTag":
193 return next["name"] == 'tbody'
194 else:
195 return type == "EndTag" or type is None
196 elif tagname in ('td', 'th'):
197 # A td element's end tag may be omitted if the td element is
198 # immediately followed by a td or th element, or if there is
199 # no more content in the parent element.
200 # A th element's end tag may be omitted if the th element is
201 # immediately followed by a td or th element, or if there is
202 # no more content in the parent element.
203 if type == "StartTag":
204 return next["name"] in ('td', 'th')
205 else:
206 return type == "EndTag" or type is None
207 return False
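Editor's note: a usage sketch for the optional-tag filter above, assuming the standard html5lib entry points. The serializer's omit_optional_tags option applies this same filter internally; here it is applied explicitly, so that option is turned off.

    import pip._vendor.html5lib as html5lib
    from pip._vendor.html5lib import getTreeWalker, serializer
    from pip._vendor.html5lib.filters.optionaltags import Filter as OptionalTags

    dom = html5lib.parse("<ul><li>a</li><li>b</li></ul>")
    stream = OptionalTags(getTreeWalker("etree")(dom))
    out = serializer.HTMLSerializer(omit_optional_tags=False).render(stream)
    # e.g. "<ul><li>a<li>b</ul>" -- the </li> end tags and the html/head/body
    # tags are all omissible here.
    print(out)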
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/sanitizer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/sanitizer.py
new file mode 100644
index 0000000..c3199a5
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/sanitizer.py
@@ -0,0 +1,896 @@
1from __future__ import absolute_import, division, unicode_literals
2
3import re
4from xml.sax.saxutils import escape, unescape
5
6from pip._vendor.six.moves import urllib_parse as urlparse
7
8from . import base
9from ..constants import namespaces, prefixes
10
11__all__ = ["Filter"]
12
13
14allowed_elements = frozenset((
15 (namespaces['html'], 'a'),
16 (namespaces['html'], 'abbr'),
17 (namespaces['html'], 'acronym'),
18 (namespaces['html'], 'address'),
19 (namespaces['html'], 'area'),
20 (namespaces['html'], 'article'),
21 (namespaces['html'], 'aside'),
22 (namespaces['html'], 'audio'),
23 (namespaces['html'], 'b'),
24 (namespaces['html'], 'big'),
25 (namespaces['html'], 'blockquote'),
26 (namespaces['html'], 'br'),
27 (namespaces['html'], 'button'),
28 (namespaces['html'], 'canvas'),
29 (namespaces['html'], 'caption'),
30 (namespaces['html'], 'center'),
31 (namespaces['html'], 'cite'),
32 (namespaces['html'], 'code'),
33 (namespaces['html'], 'col'),
34 (namespaces['html'], 'colgroup'),
35 (namespaces['html'], 'command'),
36 (namespaces['html'], 'datagrid'),
37 (namespaces['html'], 'datalist'),
38 (namespaces['html'], 'dd'),
39 (namespaces['html'], 'del'),
40 (namespaces['html'], 'details'),
41 (namespaces['html'], 'dfn'),
42 (namespaces['html'], 'dialog'),
43 (namespaces['html'], 'dir'),
44 (namespaces['html'], 'div'),
45 (namespaces['html'], 'dl'),
46 (namespaces['html'], 'dt'),
47 (namespaces['html'], 'em'),
48 (namespaces['html'], 'event-source'),
49 (namespaces['html'], 'fieldset'),
50 (namespaces['html'], 'figcaption'),
51 (namespaces['html'], 'figure'),
52 (namespaces['html'], 'footer'),
53 (namespaces['html'], 'font'),
54 (namespaces['html'], 'form'),
55 (namespaces['html'], 'header'),
56 (namespaces['html'], 'h1'),
57 (namespaces['html'], 'h2'),
58 (namespaces['html'], 'h3'),
59 (namespaces['html'], 'h4'),
60 (namespaces['html'], 'h5'),
61 (namespaces['html'], 'h6'),
62 (namespaces['html'], 'hr'),
63 (namespaces['html'], 'i'),
64 (namespaces['html'], 'img'),
65 (namespaces['html'], 'input'),
66 (namespaces['html'], 'ins'),
67 (namespaces['html'], 'keygen'),
68 (namespaces['html'], 'kbd'),
69 (namespaces['html'], 'label'),
70 (namespaces['html'], 'legend'),
71 (namespaces['html'], 'li'),
72 (namespaces['html'], 'm'),
73 (namespaces['html'], 'map'),
74 (namespaces['html'], 'menu'),
75 (namespaces['html'], 'meter'),
76 (namespaces['html'], 'multicol'),
77 (namespaces['html'], 'nav'),
78 (namespaces['html'], 'nextid'),
79 (namespaces['html'], 'ol'),
80 (namespaces['html'], 'output'),
81 (namespaces['html'], 'optgroup'),
82 (namespaces['html'], 'option'),
83 (namespaces['html'], 'p'),
84 (namespaces['html'], 'pre'),
85 (namespaces['html'], 'progress'),
86 (namespaces['html'], 'q'),
87 (namespaces['html'], 's'),
88 (namespaces['html'], 'samp'),
89 (namespaces['html'], 'section'),
90 (namespaces['html'], 'select'),
91 (namespaces['html'], 'small'),
92 (namespaces['html'], 'sound'),
93 (namespaces['html'], 'source'),
94 (namespaces['html'], 'spacer'),
95 (namespaces['html'], 'span'),
96 (namespaces['html'], 'strike'),
97 (namespaces['html'], 'strong'),
98 (namespaces['html'], 'sub'),
99 (namespaces['html'], 'sup'),
100 (namespaces['html'], 'table'),
101 (namespaces['html'], 'tbody'),
102 (namespaces['html'], 'td'),
103 (namespaces['html'], 'textarea'),
104 (namespaces['html'], 'time'),
105 (namespaces['html'], 'tfoot'),
106 (namespaces['html'], 'th'),
107 (namespaces['html'], 'thead'),
108 (namespaces['html'], 'tr'),
109 (namespaces['html'], 'tt'),
110 (namespaces['html'], 'u'),
111 (namespaces['html'], 'ul'),
112 (namespaces['html'], 'var'),
113 (namespaces['html'], 'video'),
114 (namespaces['mathml'], 'maction'),
115 (namespaces['mathml'], 'math'),
116 (namespaces['mathml'], 'merror'),
117 (namespaces['mathml'], 'mfrac'),
118 (namespaces['mathml'], 'mi'),
119 (namespaces['mathml'], 'mmultiscripts'),
120 (namespaces['mathml'], 'mn'),
121 (namespaces['mathml'], 'mo'),
122 (namespaces['mathml'], 'mover'),
123 (namespaces['mathml'], 'mpadded'),
124 (namespaces['mathml'], 'mphantom'),
125 (namespaces['mathml'], 'mprescripts'),
126 (namespaces['mathml'], 'mroot'),
127 (namespaces['mathml'], 'mrow'),
128 (namespaces['mathml'], 'mspace'),
129 (namespaces['mathml'], 'msqrt'),
130 (namespaces['mathml'], 'mstyle'),
131 (namespaces['mathml'], 'msub'),
132 (namespaces['mathml'], 'msubsup'),
133 (namespaces['mathml'], 'msup'),
134 (namespaces['mathml'], 'mtable'),
135 (namespaces['mathml'], 'mtd'),
136 (namespaces['mathml'], 'mtext'),
137 (namespaces['mathml'], 'mtr'),
138 (namespaces['mathml'], 'munder'),
139 (namespaces['mathml'], 'munderover'),
140 (namespaces['mathml'], 'none'),
141 (namespaces['svg'], 'a'),
142 (namespaces['svg'], 'animate'),
143 (namespaces['svg'], 'animateColor'),
144 (namespaces['svg'], 'animateMotion'),
145 (namespaces['svg'], 'animateTransform'),
146 (namespaces['svg'], 'clipPath'),
147 (namespaces['svg'], 'circle'),
148 (namespaces['svg'], 'defs'),
149 (namespaces['svg'], 'desc'),
150 (namespaces['svg'], 'ellipse'),
151 (namespaces['svg'], 'font-face'),
152 (namespaces['svg'], 'font-face-name'),
153 (namespaces['svg'], 'font-face-src'),
154 (namespaces['svg'], 'g'),
155 (namespaces['svg'], 'glyph'),
156 (namespaces['svg'], 'hkern'),
157 (namespaces['svg'], 'linearGradient'),
158 (namespaces['svg'], 'line'),
159 (namespaces['svg'], 'marker'),
160 (namespaces['svg'], 'metadata'),
161 (namespaces['svg'], 'missing-glyph'),
162 (namespaces['svg'], 'mpath'),
163 (namespaces['svg'], 'path'),
164 (namespaces['svg'], 'polygon'),
165 (namespaces['svg'], 'polyline'),
166 (namespaces['svg'], 'radialGradient'),
167 (namespaces['svg'], 'rect'),
168 (namespaces['svg'], 'set'),
169 (namespaces['svg'], 'stop'),
170 (namespaces['svg'], 'svg'),
171 (namespaces['svg'], 'switch'),
172 (namespaces['svg'], 'text'),
173 (namespaces['svg'], 'title'),
174 (namespaces['svg'], 'tspan'),
175 (namespaces['svg'], 'use'),
176))
177
178allowed_attributes = frozenset((
179 # HTML attributes
180 (None, 'abbr'),
181 (None, 'accept'),
182 (None, 'accept-charset'),
183 (None, 'accesskey'),
184 (None, 'action'),
185 (None, 'align'),
186 (None, 'alt'),
187 (None, 'autocomplete'),
188 (None, 'autofocus'),
189 (None, 'axis'),
190 (None, 'background'),
191 (None, 'balance'),
192 (None, 'bgcolor'),
193 (None, 'bgproperties'),
194 (None, 'border'),
195 (None, 'bordercolor'),
196 (None, 'bordercolordark'),
197 (None, 'bordercolorlight'),
198 (None, 'bottompadding'),
199 (None, 'cellpadding'),
200 (None, 'cellspacing'),
201 (None, 'ch'),
202 (None, 'challenge'),
203 (None, 'char'),
204 (None, 'charoff'),
205 (None, 'choff'),
206 (None, 'charset'),
207 (None, 'checked'),
208 (None, 'cite'),
209 (None, 'class'),
210 (None, 'clear'),
211 (None, 'color'),
212 (None, 'cols'),
213 (None, 'colspan'),
214 (None, 'compact'),
215 (None, 'contenteditable'),
216 (None, 'controls'),
217 (None, 'coords'),
218 (None, 'data'),
219 (None, 'datafld'),
220 (None, 'datapagesize'),
221 (None, 'datasrc'),
222 (None, 'datetime'),
223 (None, 'default'),
224 (None, 'delay'),
225 (None, 'dir'),
226 (None, 'disabled'),
227 (None, 'draggable'),
228 (None, 'dynsrc'),
229 (None, 'enctype'),
230 (None, 'end'),
231 (None, 'face'),
232 (None, 'for'),
233 (None, 'form'),
234 (None, 'frame'),
235 (None, 'galleryimg'),
236 (None, 'gutter'),
237 (None, 'headers'),
238 (None, 'height'),
239 (None, 'hidefocus'),
240 (None, 'hidden'),
241 (None, 'high'),
242 (None, 'href'),
243 (None, 'hreflang'),
244 (None, 'hspace'),
245 (None, 'icon'),
246 (None, 'id'),
247 (None, 'inputmode'),
248 (None, 'ismap'),
249 (None, 'keytype'),
250 (None, 'label'),
251 (None, 'leftspacing'),
252 (None, 'lang'),
253 (None, 'list'),
254 (None, 'longdesc'),
255 (None, 'loop'),
256 (None, 'loopcount'),
257 (None, 'loopend'),
258 (None, 'loopstart'),
259 (None, 'low'),
260 (None, 'lowsrc'),
261 (None, 'max'),
262 (None, 'maxlength'),
263 (None, 'media'),
264 (None, 'method'),
265 (None, 'min'),
266 (None, 'multiple'),
267 (None, 'name'),
268 (None, 'nohref'),
269 (None, 'noshade'),
270 (None, 'nowrap'),
271 (None, 'open'),
272 (None, 'optimum'),
273 (None, 'pattern'),
274 (None, 'ping'),
275 (None, 'point-size'),
276 (None, 'poster'),
277 (None, 'pqg'),
278 (None, 'preload'),
279 (None, 'prompt'),
280 (None, 'radiogroup'),
281 (None, 'readonly'),
282 (None, 'rel'),
283 (None, 'repeat-max'),
284 (None, 'repeat-min'),
285 (None, 'replace'),
286 (None, 'required'),
287 (None, 'rev'),
288 (None, 'rightspacing'),
289 (None, 'rows'),
290 (None, 'rowspan'),
291 (None, 'rules'),
292 (None, 'scope'),
293 (None, 'selected'),
294 (None, 'shape'),
295 (None, 'size'),
296 (None, 'span'),
297 (None, 'src'),
298 (None, 'start'),
299 (None, 'step'),
300 (None, 'style'),
301 (None, 'summary'),
302 (None, 'suppress'),
303 (None, 'tabindex'),
304 (None, 'target'),
305 (None, 'template'),
306 (None, 'title'),
307 (None, 'toppadding'),
308 (None, 'type'),
309 (None, 'unselectable'),
310 (None, 'usemap'),
311 (None, 'urn'),
312 (None, 'valign'),
313 (None, 'value'),
314 (None, 'variable'),
315 (None, 'volume'),
316 (None, 'vspace'),
317 (None, 'vrml'),
318 (None, 'width'),
319 (None, 'wrap'),
320 (namespaces['xml'], 'lang'),
321 # MathML attributes
322 (None, 'actiontype'),
323 (None, 'align'),
324 (None, 'columnalign'),
325 (None, 'columnalign'),
326 (None, 'columnalign'),
327 (None, 'columnlines'),
328 (None, 'columnspacing'),
329 (None, 'columnspan'),
330 (None, 'depth'),
331 (None, 'display'),
332 (None, 'displaystyle'),
333 (None, 'equalcolumns'),
334 (None, 'equalrows'),
335 (None, 'fence'),
336 (None, 'fontstyle'),
337 (None, 'fontweight'),
338 (None, 'frame'),
339 (None, 'height'),
340 (None, 'linethickness'),
341 (None, 'lspace'),
342 (None, 'mathbackground'),
343 (None, 'mathcolor'),
344 (None, 'mathvariant'),
345 (None, 'mathvariant'),
346 (None, 'maxsize'),
347 (None, 'minsize'),
348 (None, 'other'),
349 (None, 'rowalign'),
350 (None, 'rowalign'),
351 (None, 'rowalign'),
352 (None, 'rowlines'),
353 (None, 'rowspacing'),
354 (None, 'rowspan'),
355 (None, 'rspace'),
356 (None, 'scriptlevel'),
357 (None, 'selection'),
358 (None, 'separator'),
359 (None, 'stretchy'),
360 (None, 'width'),
361 (None, 'width'),
362 (namespaces['xlink'], 'href'),
363 (namespaces['xlink'], 'show'),
364 (namespaces['xlink'], 'type'),
365 # SVG attributes
366 (None, 'accent-height'),
367 (None, 'accumulate'),
368 (None, 'additive'),
369 (None, 'alphabetic'),
370 (None, 'arabic-form'),
371 (None, 'ascent'),
372 (None, 'attributeName'),
373 (None, 'attributeType'),
374 (None, 'baseProfile'),
375 (None, 'bbox'),
376 (None, 'begin'),
377 (None, 'by'),
378 (None, 'calcMode'),
379 (None, 'cap-height'),
380 (None, 'class'),
381 (None, 'clip-path'),
382 (None, 'color'),
383 (None, 'color-rendering'),
384 (None, 'content'),
385 (None, 'cx'),
386 (None, 'cy'),
387 (None, 'd'),
388 (None, 'dx'),
389 (None, 'dy'),
390 (None, 'descent'),
391 (None, 'display'),
392 (None, 'dur'),
393 (None, 'end'),
394 (None, 'fill'),
395 (None, 'fill-opacity'),
396 (None, 'fill-rule'),
397 (None, 'font-family'),
398 (None, 'font-size'),
399 (None, 'font-stretch'),
400 (None, 'font-style'),
401 (None, 'font-variant'),
402 (None, 'font-weight'),
403 (None, 'from'),
404 (None, 'fx'),
405 (None, 'fy'),
406 (None, 'g1'),
407 (None, 'g2'),
408 (None, 'glyph-name'),
409 (None, 'gradientUnits'),
410 (None, 'hanging'),
411 (None, 'height'),
412 (None, 'horiz-adv-x'),
413 (None, 'horiz-origin-x'),
414 (None, 'id'),
415 (None, 'ideographic'),
416 (None, 'k'),
417 (None, 'keyPoints'),
418 (None, 'keySplines'),
419 (None, 'keyTimes'),
420 (None, 'lang'),
421 (None, 'marker-end'),
422 (None, 'marker-mid'),
423 (None, 'marker-start'),
424 (None, 'markerHeight'),
425 (None, 'markerUnits'),
426 (None, 'markerWidth'),
427 (None, 'mathematical'),
428 (None, 'max'),
429 (None, 'min'),
430 (None, 'name'),
431 (None, 'offset'),
432 (None, 'opacity'),
433 (None, 'orient'),
434 (None, 'origin'),
435 (None, 'overline-position'),
436 (None, 'overline-thickness'),
437 (None, 'panose-1'),
438 (None, 'path'),
439 (None, 'pathLength'),
440 (None, 'points'),
441 (None, 'preserveAspectRatio'),
442 (None, 'r'),
443 (None, 'refX'),
444 (None, 'refY'),
445 (None, 'repeatCount'),
446 (None, 'repeatDur'),
447 (None, 'requiredExtensions'),
448 (None, 'requiredFeatures'),
449 (None, 'restart'),
450 (None, 'rotate'),
451 (None, 'rx'),
452 (None, 'ry'),
453 (None, 'slope'),
454 (None, 'stemh'),
455 (None, 'stemv'),
456 (None, 'stop-color'),
457 (None, 'stop-opacity'),
458 (None, 'strikethrough-position'),
459 (None, 'strikethrough-thickness'),
460 (None, 'stroke'),
461 (None, 'stroke-dasharray'),
462 (None, 'stroke-dashoffset'),
463 (None, 'stroke-linecap'),
464 (None, 'stroke-linejoin'),
465 (None, 'stroke-miterlimit'),
466 (None, 'stroke-opacity'),
467 (None, 'stroke-width'),
468 (None, 'systemLanguage'),
469 (None, 'target'),
470 (None, 'text-anchor'),
471 (None, 'to'),
472 (None, 'transform'),
473 (None, 'type'),
474 (None, 'u1'),
475 (None, 'u2'),
476 (None, 'underline-position'),
477 (None, 'underline-thickness'),
478 (None, 'unicode'),
479 (None, 'unicode-range'),
480 (None, 'units-per-em'),
481 (None, 'values'),
482 (None, 'version'),
483 (None, 'viewBox'),
484 (None, 'visibility'),
485 (None, 'width'),
486 (None, 'widths'),
487 (None, 'x'),
488 (None, 'x-height'),
489 (None, 'x1'),
490 (None, 'x2'),
491 (namespaces['xlink'], 'actuate'),
492 (namespaces['xlink'], 'arcrole'),
493 (namespaces['xlink'], 'href'),
494 (namespaces['xlink'], 'role'),
495 (namespaces['xlink'], 'show'),
496 (namespaces['xlink'], 'title'),
497 (namespaces['xlink'], 'type'),
498 (namespaces['xml'], 'base'),
499 (namespaces['xml'], 'lang'),
500 (namespaces['xml'], 'space'),
501 (None, 'y'),
502 (None, 'y1'),
503 (None, 'y2'),
504 (None, 'zoomAndPan'),
505))
506
507attr_val_is_uri = frozenset((
508 (None, 'href'),
509 (None, 'src'),
510 (None, 'cite'),
511 (None, 'action'),
512 (None, 'longdesc'),
513 (None, 'poster'),
514 (None, 'background'),
515 (None, 'datasrc'),
516 (None, 'dynsrc'),
517 (None, 'lowsrc'),
518 (None, 'ping'),
519 (namespaces['xlink'], 'href'),
520 (namespaces['xml'], 'base'),
521))
522
523svg_attr_val_allows_ref = frozenset((
524 (None, 'clip-path'),
525 (None, 'color-profile'),
526 (None, 'cursor'),
527 (None, 'fill'),
528 (None, 'filter'),
529 (None, 'marker'),
530 (None, 'marker-start'),
531 (None, 'marker-mid'),
532 (None, 'marker-end'),
533 (None, 'mask'),
534 (None, 'stroke'),
535))
536
537svg_allow_local_href = frozenset((
538 (None, 'altGlyph'),
539 (None, 'animate'),
540 (None, 'animateColor'),
541 (None, 'animateMotion'),
542 (None, 'animateTransform'),
543 (None, 'cursor'),
544 (None, 'feImage'),
545 (None, 'filter'),
546 (None, 'linearGradient'),
547 (None, 'pattern'),
548 (None, 'radialGradient'),
549 (None, 'textpath'),
550 (None, 'tref'),
551 (None, 'set'),
552 (None, 'use')
553))
554
555allowed_css_properties = frozenset((
556 'azimuth',
557 'background-color',
558 'border-bottom-color',
559 'border-collapse',
560 'border-color',
561 'border-left-color',
562 'border-right-color',
563 'border-top-color',
564 'clear',
565 'color',
566 'cursor',
567 'direction',
568 'display',
569 'elevation',
570 'float',
571 'font',
572 'font-family',
573 'font-size',
574 'font-style',
575 'font-variant',
576 'font-weight',
577 'height',
578 'letter-spacing',
579 'line-height',
580 'overflow',
581 'pause',
582 'pause-after',
583 'pause-before',
584 'pitch',
585 'pitch-range',
586 'richness',
587 'speak',
588 'speak-header',
589 'speak-numeral',
590 'speak-punctuation',
591 'speech-rate',
592 'stress',
593 'text-align',
594 'text-decoration',
595 'text-indent',
596 'unicode-bidi',
597 'vertical-align',
598 'voice-family',
599 'volume',
600 'white-space',
601 'width',
602))
603
604allowed_css_keywords = frozenset((
605 'auto',
606 'aqua',
607 'black',
608 'block',
609 'blue',
610 'bold',
611 'both',
612 'bottom',
613 'brown',
614 'center',
615 'collapse',
616 'dashed',
617 'dotted',
618 'fuchsia',
619 'gray',
620 'green',
621 '!important',
622 'italic',
623 'left',
624 'lime',
625 'maroon',
626 'medium',
627 'none',
628 'navy',
629 'normal',
630 'nowrap',
631 'olive',
632 'pointer',
633 'purple',
634 'red',
635 'right',
636 'solid',
637 'silver',
638 'teal',
639 'top',
640 'transparent',
641 'underline',
642 'white',
643 'yellow',
644))
645
646allowed_svg_properties = frozenset((
647 'fill',
648 'fill-opacity',
649 'fill-rule',
650 'stroke',
651 'stroke-width',
652 'stroke-linecap',
653 'stroke-linejoin',
654 'stroke-opacity',
655))
656
657allowed_protocols = frozenset((
658 'ed2k',
659 'ftp',
660 'http',
661 'https',
662 'irc',
663 'mailto',
664 'news',
665 'gopher',
666 'nntp',
667 'telnet',
668 'webcal',
669 'xmpp',
670 'callto',
671 'feed',
672 'urn',
673 'aim',
674 'rsync',
675 'tag',
676 'ssh',
677 'sftp',
678 'rtsp',
679 'afs',
680 'data',
681))
682
683allowed_content_types = frozenset((
684 'image/png',
685 'image/jpeg',
686 'image/gif',
687 'image/webp',
688 'image/bmp',
689 'text/plain',
690))
691
692
693data_content_type = re.compile(r'''
694 ^
695 # Match a content type <application>/<type>
696 (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
697 # Match any character set and encoding
698 (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
699 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
700 # Assume the rest is data
701 ,.*
702 $
703 ''',
704 re.VERBOSE)
705
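# For illustration: for a URI such as "data:image/png;base64,iVBORw0KGgo=",
# urlparse() leaves "image/png;base64,iVBORw0KGgo=" in .path, and matching
# that path against data_content_type yields "image/png" as the
# 'content_type' group, which allowed_token() then checks against
# allowed_content_types.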
706
707class Filter(base.Filter):
708    """Sanitizes a token stream of XHTML+MathML+SVG, including inline style attributes"""
709 def __init__(self,
710 source,
711 allowed_elements=allowed_elements,
712 allowed_attributes=allowed_attributes,
713 allowed_css_properties=allowed_css_properties,
714 allowed_css_keywords=allowed_css_keywords,
715 allowed_svg_properties=allowed_svg_properties,
716 allowed_protocols=allowed_protocols,
717 allowed_content_types=allowed_content_types,
718 attr_val_is_uri=attr_val_is_uri,
719 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
720 svg_allow_local_href=svg_allow_local_href):
721 """Creates a Filter
722
723 :arg allowed_elements: set of elements to allow--everything else will
724 be escaped
725
726 :arg allowed_attributes: set of attributes to allow in
727 elements--everything else will be stripped
728
729 :arg allowed_css_properties: set of CSS properties to allow--everything
730 else will be stripped
731
732 :arg allowed_css_keywords: set of CSS keywords to allow--everything
733 else will be stripped
734
735 :arg allowed_svg_properties: set of SVG properties to allow--everything
736 else will be removed
737
738 :arg allowed_protocols: set of allowed protocols for URIs
739
740 :arg allowed_content_types: set of allowed content types for ``data`` URIs.
741
742 :arg attr_val_is_uri: set of attributes that have URI values--values
743 that have a scheme not listed in ``allowed_protocols`` are removed
744
745 :arg svg_attr_val_allows_ref: set of SVG attributes that can have
746 references
747
748        :arg svg_allow_local_href: set of SVG elements that may only have local
749            (fragment) hrefs--any non-local href on them is removed
750
751 """
752 super(Filter, self).__init__(source)
753 self.allowed_elements = allowed_elements
754 self.allowed_attributes = allowed_attributes
755 self.allowed_css_properties = allowed_css_properties
756 self.allowed_css_keywords = allowed_css_keywords
757 self.allowed_svg_properties = allowed_svg_properties
758 self.allowed_protocols = allowed_protocols
759 self.allowed_content_types = allowed_content_types
760 self.attr_val_is_uri = attr_val_is_uri
761 self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
762 self.svg_allow_local_href = svg_allow_local_href
763
764 def __iter__(self):
765 for token in base.Filter.__iter__(self):
766 token = self.sanitize_token(token)
767 if token:
768 yield token
769
770    # Sanitize the HTML, escaping all elements not in ALLOWED_ELEMENTS, and
771    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
772    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
773    # ALLOWED_CSS_KEYWORDS, is allowed through. Attributes in ATTR_VAL_IS_URI
774    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
775    # allowed.
776 #
777 # sanitize_html('<script> do_nasty_stuff() </script>')
778 # => &lt;script> do_nasty_stuff() &lt;/script>
779 # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
780 # => <a>Click here for $100</a>
781 def sanitize_token(self, token):
782
783 # accommodate filters which use token_type differently
784 token_type = token["type"]
785 if token_type in ("StartTag", "EndTag", "EmptyTag"):
786 name = token["name"]
787 namespace = token["namespace"]
788 if ((namespace, name) in self.allowed_elements or
789 (namespace is None and
790 (namespaces["html"], name) in self.allowed_elements)):
791 return self.allowed_token(token)
792 else:
793 return self.disallowed_token(token)
794 elif token_type == "Comment":
795 pass
796 else:
797 return token
798
799 def allowed_token(self, token):
800 if "data" in token:
801 attrs = token["data"]
802 attr_names = set(attrs.keys())
803
804 # Remove forbidden attributes
805 for to_remove in (attr_names - self.allowed_attributes):
806 del token["data"][to_remove]
807 attr_names.remove(to_remove)
808
809 # Remove attributes with disallowed URL values
810 for attr in (attr_names & self.attr_val_is_uri):
811 assert attr in attrs
812 # I don't have a clue where this regexp comes from or why it matches those
813 # characters, nor why we call unescape. I just know it's always been here.
814 # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
815 # this will do is remove *more* than it otherwise would.
816 val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
817 unescape(attrs[attr])).lower()
818 # remove replacement characters from unescaped characters
819 val_unescaped = val_unescaped.replace("\ufffd", "")
820 try:
821 uri = urlparse.urlparse(val_unescaped)
822 except ValueError:
823 uri = None
824 del attrs[attr]
825 if uri and uri.scheme:
826 if uri.scheme not in self.allowed_protocols:
827 del attrs[attr]
828 if uri.scheme == 'data':
829 m = data_content_type.match(uri.path)
830 if not m:
831 del attrs[attr]
832 elif m.group('content_type') not in self.allowed_content_types:
833 del attrs[attr]
834
835 for attr in self.svg_attr_val_allows_ref:
836 if attr in attrs:
837 attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
838 ' ',
839 unescape(attrs[attr]))
840 if (token["name"] in self.svg_allow_local_href and
841 (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
842 attrs[(namespaces['xlink'], 'href')])):
843 del attrs[(namespaces['xlink'], 'href')]
844 if (None, 'style') in attrs:
845 attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
846 token["data"] = attrs
847 return token
848
849 def disallowed_token(self, token):
850 token_type = token["type"]
851 if token_type == "EndTag":
852 token["data"] = "</%s>" % token["name"]
853 elif token["data"]:
854 assert token_type in ("StartTag", "EmptyTag")
855 attrs = []
856 for (ns, name), v in token["data"].items():
857 attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
858 token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
859 else:
860 token["data"] = "<%s>" % token["name"]
861 if token.get("selfClosing"):
862 token["data"] = token["data"][:-1] + "/>"
863
864 token["type"] = "Characters"
865
866 del token["name"]
867 return token
868
869 def sanitize_css(self, style):
870 # disallow urls
871 style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
872
873 # gauntlet
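        # The two checks below are a whitelist pass over the raw declaration
        # text: the first only allows characters we expect in property/value
        # pairs (letters, digits, basic punctuation, quoted words, simple
        # parenthesised number lists); the second requires the whole string to
        # look like a sequence of "property: value;" declarations.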
874 if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
875 return ''
876 if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
877 return ''
878
879 clean = []
880 for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
881 if not value:
882 continue
883 if prop.lower() in self.allowed_css_properties:
884 clean.append(prop + ': ' + value + ';')
885 elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
886 'padding']:
887 for keyword in value.split():
888 if keyword not in self.allowed_css_keywords and \
889 not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
890 break
891 else:
892 clean.append(prop + ': ' + value + ';')
893 elif prop.lower() in self.allowed_svg_properties:
894 clean.append(prop + ': ' + value + ';')
895
896 return ' '.join(clean)
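A minimal sketch of how this filter is typically wired in, assuming the top-level html5lib package (the vendored copy here would be imported via pip._vendor.html5lib instead):

    import html5lib
    from html5lib.filters import sanitizer
    from html5lib.serializer import HTMLSerializer

    fragment = html5lib.parseFragment(
        '<p onclick="evil()">hi</p><script>bad()</script>', treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    # The filter wraps the treewalker's token stream: the onclick attribute is
    # stripped and the disallowed script element is escaped to literal text.
    print(HTMLSerializer().render(sanitizer.Filter(walker(fragment))))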
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/whitespace.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/whitespace.py
new file mode 100644
index 0000000..24bb0de
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/filters/whitespace.py
@@ -0,0 +1,38 @@
1from __future__ import absolute_import, division, unicode_literals
2
3import re
4
5from . import base
6from ..constants import rcdataElements, spaceCharacters
7spaceCharacters = "".join(spaceCharacters)
8
9SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
10
11
12class Filter(base.Filter):
13    """Collapses whitespace except in pre, textarea, and the RCDATA elements (script, style, etc.)"""
14 spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
15
16 def __iter__(self):
17 preserve = 0
18 for token in base.Filter.__iter__(self):
19 type = token["type"]
20 if type == "StartTag" \
21 and (preserve or token["name"] in self.spacePreserveElements):
22 preserve += 1
23
24 elif type == "EndTag" and preserve:
25 preserve -= 1
26
27 elif not preserve and type == "SpaceCharacters" and token["data"]:
28                # Test on token["data"] above so as not to introduce spaces where there were none
29 token["data"] = " "
30
31 elif not preserve and type == "Characters":
32 token["data"] = collapse_spaces(token["data"])
33
34 yield token
35
36
37def collapse_spaces(text):
38 return SPACES_REGEX.sub(' ', text)
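The same wiring pattern applies here; a brief sketch, again assuming the top-level html5lib package:

    import html5lib
    from html5lib.filters import whitespace
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parseFragment("<p>a   \n b</p><pre>a   b</pre>", treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    # Runs of whitespace inside the <p> collapse to single spaces, while the
    # <pre> content passes through untouched.
    print(HTMLSerializer().render(whitespace.Filter(walker(dom))))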
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/html5parser.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/html5parser.py
new file mode 100644
index 0000000..b185971
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/html5parser.py
@@ -0,0 +1,2791 @@
1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import with_metaclass, viewkeys
3
4import types
5from collections import OrderedDict
6
7from . import _inputstream
8from . import _tokenizer
9
10from . import treebuilders
11from .treebuilders.base import Marker
12
13from . import _utils
14from .constants import (
15 spaceCharacters, asciiUpper2Lower,
16 specialElements, headingElements, cdataElements, rcdataElements,
17 tokenTypes, tagTokenTypes,
18 namespaces,
19 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
20 adjustForeignAttributes as adjustForeignAttributesMap,
21 adjustMathMLAttributes, adjustSVGAttributes,
22 E,
23 _ReparseException
24)
25
26
27def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
28 """Parse an HTML document as a string or file-like object into a tree
29
30 :arg doc: the document to parse as a string or file-like object
31
32 :arg treebuilder: the treebuilder to use when parsing
33
34 :arg namespaceHTMLElements: whether or not to namespace HTML elements
35
36 :returns: parsed tree
37
38 Example:
39
40 >>> from html5lib.html5parser import parse
41 >>> parse('<html><body><p>This is a doc</p></body></html>')
42 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
43
44 """
45 tb = treebuilders.getTreeBuilder(treebuilder)
46 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
47 return p.parse(doc, **kwargs)
48
49
50def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
51 """Parse an HTML fragment as a string or file-like object into a tree
52
53 :arg doc: the fragment to parse as a string or file-like object
54
55 :arg container: the container context to parse the fragment in
56
57 :arg treebuilder: the treebuilder to use when parsing
58
59 :arg namespaceHTMLElements: whether or not to namespace HTML elements
60
61 :returns: parsed tree
62
63 Example:
64
65    >>> from html5lib.html5parser import parseFragment
66 >>> parseFragment('<b>this is a fragment</b>')
67 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
68
69 """
70 tb = treebuilders.getTreeBuilder(treebuilder)
71 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
72 return p.parseFragment(doc, container=container, **kwargs)
73
74
75def method_decorator_metaclass(function):
76 class Decorated(type):
77 def __new__(meta, classname, bases, classDict):
78 for attributeName, attribute in classDict.items():
79 if isinstance(attribute, types.FunctionType):
80 attribute = function(attribute)
81
82 classDict[attributeName] = attribute
83 return type.__new__(meta, classname, bases, classDict)
84 return Decorated
85
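# A small sketch of what this factory does, using hypothetical names: every
# plain function in the class body is wrapped by `function` at class-creation
# time. getPhases() below relies on this to attach the token-logging `log`
# decorator to every Phase method when debug mode is on.
#
#     def trace(fn):
#         def wrapped(self, *args, **kwargs):
#             print(fn.__name__)
#             return fn(self, *args, **kwargs)
#         return wrapped
#
#     class Traced(with_metaclass(method_decorator_metaclass(trace))):
#         def greet(self):
#             return "hi"
#
#     Traced().greet()  # prints "greet", then returns "hi"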
86
87class HTMLParser(object):
88 """HTML parser
89
90 Generates a tree structure from a stream of (possibly malformed) HTML.
91
92 """
93
94 def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
95 """
96 :arg tree: a treebuilder class controlling the type of tree that will be
97 returned. Built in treebuilders can be accessed through
98 html5lib.treebuilders.getTreeBuilder(treeType)
99
100 :arg strict: raise an exception when a parse error is encountered
101
102 :arg namespaceHTMLElements: whether or not to namespace HTML elements
103
104 :arg debug: whether or not to enable debug mode which logs things
105
106 Example:
107
108 >>> from html5lib.html5parser import HTMLParser
109 >>> parser = HTMLParser() # generates parser with etree builder
110        >>> parser = HTMLParser(treebuilders.getTreeBuilder('lxml'), strict=True) # generates a strict parser with the lxml builder
111
112 """
113
114 # Raise an exception on the first error encountered
115 self.strict = strict
116
117 if tree is None:
118 tree = treebuilders.getTreeBuilder("etree")
119 self.tree = tree(namespaceHTMLElements)
120 self.errors = []
121
122 self.phases = dict([(name, cls(self, self.tree)) for name, cls in
123 getPhases(debug).items()])
124
125 def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
126
127 self.innerHTMLMode = innerHTML
128 self.container = container
129 self.scripting = scripting
130 self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
131 self.reset()
132
133 try:
134 self.mainLoop()
135 except _ReparseException:
136 self.reset()
137 self.mainLoop()
138
139 def reset(self):
140 self.tree.reset()
141 self.firstStartTag = False
142 self.errors = []
143 self.log = [] # only used with debug mode
144 # "quirks" / "limited quirks" / "no quirks"
145 self.compatMode = "no quirks"
146
147 if self.innerHTMLMode:
148 self.innerHTML = self.container.lower()
149
150 if self.innerHTML in cdataElements:
151 self.tokenizer.state = self.tokenizer.rcdataState
152 elif self.innerHTML in rcdataElements:
153 self.tokenizer.state = self.tokenizer.rawtextState
154 elif self.innerHTML == 'plaintext':
155 self.tokenizer.state = self.tokenizer.plaintextState
156 else:
157 # state already is data state
158 # self.tokenizer.state = self.tokenizer.dataState
159 pass
160 self.phase = self.phases["beforeHtml"]
161 self.phase.insertHtmlElement()
162 self.resetInsertionMode()
163 else:
164 self.innerHTML = False # pylint:disable=redefined-variable-type
165 self.phase = self.phases["initial"]
166
167 self.lastPhase = None
168
169 self.beforeRCDataPhase = None
170
171 self.framesetOK = True
172
173 @property
174 def documentEncoding(self):
175 """Name of the character encoding that was used to decode the input stream, or
176 :obj:`None` if that is not determined yet
177
178 """
179 if not hasattr(self, 'tokenizer'):
180 return None
181 return self.tokenizer.stream.charEncoding[0].name
182
183 def isHTMLIntegrationPoint(self, element):
184 if (element.name == "annotation-xml" and
185 element.namespace == namespaces["mathml"]):
186 return ("encoding" in element.attributes and
187 element.attributes["encoding"].translate(
188 asciiUpper2Lower) in
189 ("text/html", "application/xhtml+xml"))
190 else:
191 return (element.namespace, element.name) in htmlIntegrationPointElements
192
193 def isMathMLTextIntegrationPoint(self, element):
194 return (element.namespace, element.name) in mathmlTextIntegrationPointElements
195
196 def mainLoop(self):
197 CharactersToken = tokenTypes["Characters"]
198 SpaceCharactersToken = tokenTypes["SpaceCharacters"]
199 StartTagToken = tokenTypes["StartTag"]
200 EndTagToken = tokenTypes["EndTag"]
201 CommentToken = tokenTypes["Comment"]
202 DoctypeToken = tokenTypes["Doctype"]
203 ParseErrorToken = tokenTypes["ParseError"]
204
205 for token in self.normalizedTokens():
206 prev_token = None
207 new_token = token
208 while new_token is not None:
209 prev_token = new_token
210 currentNode = self.tree.openElements[-1] if self.tree.openElements else None
211 currentNodeNamespace = currentNode.namespace if currentNode else None
212 currentNodeName = currentNode.name if currentNode else None
213
214 type = new_token["type"]
215
216 if type == ParseErrorToken:
217 self.parseError(new_token["data"], new_token.get("datavars", {}))
218 new_token = None
219 else:
220 if (len(self.tree.openElements) == 0 or
221 currentNodeNamespace == self.tree.defaultNamespace or
222 (self.isMathMLTextIntegrationPoint(currentNode) and
223 ((type == StartTagToken and
224 token["name"] not in frozenset(["mglyph", "malignmark"])) or
225 type in (CharactersToken, SpaceCharactersToken))) or
226 (currentNodeNamespace == namespaces["mathml"] and
227 currentNodeName == "annotation-xml" and
228 type == StartTagToken and
229 token["name"] == "svg") or
230 (self.isHTMLIntegrationPoint(currentNode) and
231 type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
232 phase = self.phase
233 else:
234 phase = self.phases["inForeignContent"]
235
236 if type == CharactersToken:
237 new_token = phase.processCharacters(new_token)
238 elif type == SpaceCharactersToken:
239 new_token = phase.processSpaceCharacters(new_token)
240 elif type == StartTagToken:
241 new_token = phase.processStartTag(new_token)
242 elif type == EndTagToken:
243 new_token = phase.processEndTag(new_token)
244 elif type == CommentToken:
245 new_token = phase.processComment(new_token)
246 elif type == DoctypeToken:
247 new_token = phase.processDoctype(new_token)
248
249 if (type == StartTagToken and prev_token["selfClosing"] and
250 not prev_token["selfClosingAcknowledged"]):
251 self.parseError("non-void-element-with-trailing-solidus",
252 {"name": prev_token["name"]})
253
254 # When the loop finishes it's EOF
255 reprocess = True
256 phases = []
257 while reprocess:
258 phases.append(self.phase)
259 reprocess = self.phase.processEOF()
260 if reprocess:
261 assert self.phase not in phases
262
263 def normalizedTokens(self):
264 for token in self.tokenizer:
265 yield self.normalizeToken(token)
266
267 def parse(self, stream, *args, **kwargs):
268        """Parse an HTML document into a well-formed tree
269
270 :arg stream: a file-like object or string containing the HTML to be parsed
271
272 The optional encoding parameter must be a string that indicates
273 the encoding. If specified, that encoding will be used,
274 regardless of any BOM or later declaration (such as in a meta
275 element).
276
277 :arg scripting: treat noscript elements as if JavaScript was turned on
278
279 :returns: parsed tree
280
281 Example:
282
283 >>> from html5lib.html5parser import HTMLParser
284 >>> parser = HTMLParser()
285 >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
286 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
287
288 """
289 self._parse(stream, False, None, *args, **kwargs)
290 return self.tree.getDocument()
291
292 def parseFragment(self, stream, *args, **kwargs):
293        """Parse an HTML fragment into a well-formed tree fragment
294
295        :arg container: name of the element we're setting the innerHTML
296            property on; if set to None, defaults to 'div'
297
298 :arg stream: a file-like object or string containing the HTML to be parsed
299
300 The optional encoding parameter must be a string that indicates
301 the encoding. If specified, that encoding will be used,
302 regardless of any BOM or later declaration (such as in a meta
303 element)
304
305 :arg scripting: treat noscript elements as if JavaScript was turned on
306
307 :returns: parsed tree
308
309 Example:
310
311        >>> from html5lib.html5parser import HTMLParser
312 >>> parser = HTMLParser()
313 >>> parser.parseFragment('<b>this is a fragment</b>')
314 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
315
316 """
317 self._parse(stream, True, *args, **kwargs)
318 return self.tree.getFragment()
319
320 def parseError(self, errorcode="XXX-undefined-error", datavars=None):
321 # XXX The idea is to make errorcode mandatory.
322 if datavars is None:
323 datavars = {}
324 self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
325 if self.strict:
326 raise ParseError(E[errorcode] % datavars)
327
328 def normalizeToken(self, token):
329 # HTML5 specific normalizations to the token stream
330 if token["type"] == tokenTypes["StartTag"]:
331 raw = token["data"]
332 token["data"] = OrderedDict(raw)
333 if len(raw) > len(token["data"]):
334 # we had some duplicated attribute, fix so first wins
335 token["data"].update(raw[::-1])
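                # e.g. raw == [('class', 'a'), ('class', 'b')] first yields
                # {'class': 'b'}; updating with the reversed list walks back to
                # the earliest value, so the first occurrence wins: {'class': 'a'}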
336
337 return token
338
339 def adjustMathMLAttributes(self, token):
340 adjust_attributes(token, adjustMathMLAttributes)
341
342 def adjustSVGAttributes(self, token):
343 adjust_attributes(token, adjustSVGAttributes)
344
345 def adjustForeignAttributes(self, token):
346 adjust_attributes(token, adjustForeignAttributesMap)
347
348 def reparseTokenNormal(self, token):
349 # pylint:disable=unused-argument
350 self.parser.phase()
351
352 def resetInsertionMode(self):
353 # The name of this method is mostly historical. (It's also used in the
354 # specification.)
355 last = False
356 newModes = {
357 "select": "inSelect",
358 "td": "inCell",
359 "th": "inCell",
360 "tr": "inRow",
361 "tbody": "inTableBody",
362 "thead": "inTableBody",
363 "tfoot": "inTableBody",
364 "caption": "inCaption",
365 "colgroup": "inColumnGroup",
366 "table": "inTable",
367 "head": "inBody",
368 "body": "inBody",
369 "frameset": "inFrameset",
370 "html": "beforeHead"
371 }
372 for node in self.tree.openElements[::-1]:
373 nodeName = node.name
374 new_phase = None
375 if node == self.tree.openElements[0]:
376 assert self.innerHTML
377 last = True
378 nodeName = self.innerHTML
379 # Check for conditions that should only happen in the innerHTML
380 # case
381 if nodeName in ("select", "colgroup", "head", "html"):
382 assert self.innerHTML
383
384 if not last and node.namespace != self.tree.defaultNamespace:
385 continue
386
387 if nodeName in newModes:
388 new_phase = self.phases[newModes[nodeName]]
389 break
390 elif last:
391 new_phase = self.phases["inBody"]
392 break
393
394 self.phase = new_phase
395
396 def parseRCDataRawtext(self, token, contentType):
397 # Generic RCDATA/RAWTEXT Parsing algorithm
398 assert contentType in ("RAWTEXT", "RCDATA")
399
400 self.tree.insertElement(token)
401
402 if contentType == "RAWTEXT":
403 self.tokenizer.state = self.tokenizer.rawtextState
404 else:
405 self.tokenizer.state = self.tokenizer.rcdataState
406
407 self.originalPhase = self.phase
408
409 self.phase = self.phases["text"]
410
411
412@_utils.memoize
413def getPhases(debug):
414 def log(function):
415 """Logger that records which phase processes each token"""
416 type_names = dict((value, key) for key, value in
417 tokenTypes.items())
418
419 def wrapped(self, *args, **kwargs):
420 if function.__name__.startswith("process") and len(args) > 0:
421 token = args[0]
422 try:
423 info = {"type": type_names[token['type']]}
424 except:
425 raise
426 if token['type'] in tagTokenTypes:
427 info["name"] = token['name']
428
429 self.parser.log.append((self.parser.tokenizer.state.__name__,
430 self.parser.phase.__class__.__name__,
431 self.__class__.__name__,
432 function.__name__,
433 info))
434 return function(self, *args, **kwargs)
435 else:
436 return function(self, *args, **kwargs)
437 return wrapped
438
439 def getMetaclass(use_metaclass, metaclass_func):
440 if use_metaclass:
441 return method_decorator_metaclass(metaclass_func)
442 else:
443 return type
444
445 # pylint:disable=unused-argument
446 class Phase(with_metaclass(getMetaclass(debug, log))):
447 """Base class for helper object that implements each phase of processing
448 """
449
450 def __init__(self, parser, tree):
451 self.parser = parser
452 self.tree = tree
453
454 def processEOF(self):
455 raise NotImplementedError
456
457 def processComment(self, token):
458 # For most phases the following is correct. Where it's not it will be
459 # overridden.
460 self.tree.insertComment(token, self.tree.openElements[-1])
461
462 def processDoctype(self, token):
463 self.parser.parseError("unexpected-doctype")
464
465 def processCharacters(self, token):
466 self.tree.insertText(token["data"])
467
468 def processSpaceCharacters(self, token):
469 self.tree.insertText(token["data"])
470
471 def processStartTag(self, token):
472 return self.startTagHandler[token["name"]](token)
473
474 def startTagHtml(self, token):
475 if not self.parser.firstStartTag and token["name"] == "html":
476 self.parser.parseError("non-html-root")
477 # XXX Need a check here to see if the first start tag token emitted is
478 # this token... If it's not, invoke self.parser.parseError().
479 for attr, value in token["data"].items():
480 if attr not in self.tree.openElements[0].attributes:
481 self.tree.openElements[0].attributes[attr] = value
482 self.parser.firstStartTag = False
483
484 def processEndTag(self, token):
485 return self.endTagHandler[token["name"]](token)
486
487 class InitialPhase(Phase):
488 def processSpaceCharacters(self, token):
489 pass
490
491 def processComment(self, token):
492 self.tree.insertComment(token, self.tree.document)
493
494 def processDoctype(self, token):
495 name = token["name"]
496 publicId = token["publicId"]
497 systemId = token["systemId"]
498 correct = token["correct"]
499
500 if (name != "html" or publicId is not None or
501 systemId is not None and systemId != "about:legacy-compat"):
502 self.parser.parseError("unknown-doctype")
503
504 if publicId is None:
505 publicId = ""
506
507 self.tree.insertDoctype(token)
508
509 if publicId != "":
510 publicId = publicId.translate(asciiUpper2Lower)
511
512 if (not correct or token["name"] != "html" or
513 publicId.startswith(
514 ("+//silmaril//dtd html pro v0r11 19970101//",
515 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
516 "-//as//dtd html 3.0 aswedit + extensions//",
517 "-//ietf//dtd html 2.0 level 1//",
518 "-//ietf//dtd html 2.0 level 2//",
519 "-//ietf//dtd html 2.0 strict level 1//",
520 "-//ietf//dtd html 2.0 strict level 2//",
521 "-//ietf//dtd html 2.0 strict//",
522 "-//ietf//dtd html 2.0//",
523 "-//ietf//dtd html 2.1e//",
524 "-//ietf//dtd html 3.0//",
525 "-//ietf//dtd html 3.2 final//",
526 "-//ietf//dtd html 3.2//",
527 "-//ietf//dtd html 3//",
528 "-//ietf//dtd html level 0//",
529 "-//ietf//dtd html level 1//",
530 "-//ietf//dtd html level 2//",
531 "-//ietf//dtd html level 3//",
532 "-//ietf//dtd html strict level 0//",
533 "-//ietf//dtd html strict level 1//",
534 "-//ietf//dtd html strict level 2//",
535 "-//ietf//dtd html strict level 3//",
536 "-//ietf//dtd html strict//",
537 "-//ietf//dtd html//",
538 "-//metrius//dtd metrius presentational//",
539 "-//microsoft//dtd internet explorer 2.0 html strict//",
540 "-//microsoft//dtd internet explorer 2.0 html//",
541 "-//microsoft//dtd internet explorer 2.0 tables//",
542 "-//microsoft//dtd internet explorer 3.0 html strict//",
543 "-//microsoft//dtd internet explorer 3.0 html//",
544 "-//microsoft//dtd internet explorer 3.0 tables//",
545 "-//netscape comm. corp.//dtd html//",
546 "-//netscape comm. corp.//dtd strict html//",
547 "-//o'reilly and associates//dtd html 2.0//",
548 "-//o'reilly and associates//dtd html extended 1.0//",
549 "-//o'reilly and associates//dtd html extended relaxed 1.0//",
550 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
551 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
552 "-//spyglass//dtd html 2.0 extended//",
553 "-//sq//dtd html 2.0 hotmetal + extensions//",
554 "-//sun microsystems corp.//dtd hotjava html//",
555 "-//sun microsystems corp.//dtd hotjava strict html//",
556 "-//w3c//dtd html 3 1995-03-24//",
557 "-//w3c//dtd html 3.2 draft//",
558 "-//w3c//dtd html 3.2 final//",
559 "-//w3c//dtd html 3.2//",
560 "-//w3c//dtd html 3.2s draft//",
561 "-//w3c//dtd html 4.0 frameset//",
562 "-//w3c//dtd html 4.0 transitional//",
563 "-//w3c//dtd html experimental 19960712//",
564 "-//w3c//dtd html experimental 970421//",
565 "-//w3c//dtd w3 html//",
566 "-//w3o//dtd w3 html 3.0//",
567 "-//webtechs//dtd mozilla html 2.0//",
568 "-//webtechs//dtd mozilla html//")) or
569 publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
570 "-/w3c/dtd html 4.0 transitional/en",
571 "html") or
572 publicId.startswith(
573 ("-//w3c//dtd html 4.01 frameset//",
574 "-//w3c//dtd html 4.01 transitional//")) and
575 systemId is None or
576 systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
577 self.parser.compatMode = "quirks"
578 elif (publicId.startswith(
579 ("-//w3c//dtd xhtml 1.0 frameset//",
580 "-//w3c//dtd xhtml 1.0 transitional//")) or
581 publicId.startswith(
582 ("-//w3c//dtd html 4.01 frameset//",
583 "-//w3c//dtd html 4.01 transitional//")) and
584 systemId is not None):
585 self.parser.compatMode = "limited quirks"
586
587 self.parser.phase = self.parser.phases["beforeHtml"]
588
589 def anythingElse(self):
590 self.parser.compatMode = "quirks"
591 self.parser.phase = self.parser.phases["beforeHtml"]
592
593 def processCharacters(self, token):
594 self.parser.parseError("expected-doctype-but-got-chars")
595 self.anythingElse()
596 return token
597
598 def processStartTag(self, token):
599 self.parser.parseError("expected-doctype-but-got-start-tag",
600 {"name": token["name"]})
601 self.anythingElse()
602 return token
603
604 def processEndTag(self, token):
605 self.parser.parseError("expected-doctype-but-got-end-tag",
606 {"name": token["name"]})
607 self.anythingElse()
608 return token
609
610 def processEOF(self):
611 self.parser.parseError("expected-doctype-but-got-eof")
612 self.anythingElse()
613 return True
614
615 class BeforeHtmlPhase(Phase):
616 # helper methods
617 def insertHtmlElement(self):
618 self.tree.insertRoot(impliedTagToken("html", "StartTag"))
619 self.parser.phase = self.parser.phases["beforeHead"]
620
621 # other
622 def processEOF(self):
623 self.insertHtmlElement()
624 return True
625
626 def processComment(self, token):
627 self.tree.insertComment(token, self.tree.document)
628
629 def processSpaceCharacters(self, token):
630 pass
631
632 def processCharacters(self, token):
633 self.insertHtmlElement()
634 return token
635
636 def processStartTag(self, token):
637 if token["name"] == "html":
638 self.parser.firstStartTag = True
639 self.insertHtmlElement()
640 return token
641
642 def processEndTag(self, token):
643 if token["name"] not in ("head", "body", "html", "br"):
644 self.parser.parseError("unexpected-end-tag-before-html",
645 {"name": token["name"]})
646 else:
647 self.insertHtmlElement()
648 return token
649
650 class BeforeHeadPhase(Phase):
651 def __init__(self, parser, tree):
652 Phase.__init__(self, parser, tree)
653
654 self.startTagHandler = _utils.MethodDispatcher([
655 ("html", self.startTagHtml),
656 ("head", self.startTagHead)
657 ])
658 self.startTagHandler.default = self.startTagOther
659
660 self.endTagHandler = _utils.MethodDispatcher([
661 (("head", "body", "html", "br"), self.endTagImplyHead)
662 ])
663 self.endTagHandler.default = self.endTagOther
664
665 def processEOF(self):
666 self.startTagHead(impliedTagToken("head", "StartTag"))
667 return True
668
669 def processSpaceCharacters(self, token):
670 pass
671
672 def processCharacters(self, token):
673 self.startTagHead(impliedTagToken("head", "StartTag"))
674 return token
675
676 def startTagHtml(self, token):
677 return self.parser.phases["inBody"].processStartTag(token)
678
679 def startTagHead(self, token):
680 self.tree.insertElement(token)
681 self.tree.headPointer = self.tree.openElements[-1]
682 self.parser.phase = self.parser.phases["inHead"]
683
684 def startTagOther(self, token):
685 self.startTagHead(impliedTagToken("head", "StartTag"))
686 return token
687
688 def endTagImplyHead(self, token):
689 self.startTagHead(impliedTagToken("head", "StartTag"))
690 return token
691
692 def endTagOther(self, token):
693 self.parser.parseError("end-tag-after-implied-root",
694 {"name": token["name"]})
695
696 class InHeadPhase(Phase):
697 def __init__(self, parser, tree):
698 Phase.__init__(self, parser, tree)
699
700 self.startTagHandler = _utils.MethodDispatcher([
701 ("html", self.startTagHtml),
702 ("title", self.startTagTitle),
703 (("noframes", "style"), self.startTagNoFramesStyle),
704 ("noscript", self.startTagNoscript),
705 ("script", self.startTagScript),
706 (("base", "basefont", "bgsound", "command", "link"),
707 self.startTagBaseLinkCommand),
708 ("meta", self.startTagMeta),
709 ("head", self.startTagHead)
710 ])
711 self.startTagHandler.default = self.startTagOther
712
713 self.endTagHandler = _utils.MethodDispatcher([
714 ("head", self.endTagHead),
715 (("br", "html", "body"), self.endTagHtmlBodyBr)
716 ])
717 self.endTagHandler.default = self.endTagOther
718
719 # the real thing
720 def processEOF(self):
721 self.anythingElse()
722 return True
723
724 def processCharacters(self, token):
725 self.anythingElse()
726 return token
727
728 def startTagHtml(self, token):
729 return self.parser.phases["inBody"].processStartTag(token)
730
731 def startTagHead(self, token):
732 self.parser.parseError("two-heads-are-not-better-than-one")
733
734 def startTagBaseLinkCommand(self, token):
735 self.tree.insertElement(token)
736 self.tree.openElements.pop()
737 token["selfClosingAcknowledged"] = True
738
739 def startTagMeta(self, token):
740 self.tree.insertElement(token)
741 self.tree.openElements.pop()
742 token["selfClosingAcknowledged"] = True
743
744 attributes = token["data"]
745 if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
746 if "charset" in attributes:
747 self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
748 elif ("content" in attributes and
749 "http-equiv" in attributes and
750 attributes["http-equiv"].lower() == "content-type"):
751 # Encoding it as UTF-8 here is a hack, as really we should pass
752 # the abstract Unicode string, and just use the
753 # ContentAttrParser on that, but using UTF-8 allows all chars
754                # to be encoded and, as an ASCII superset, it works.
755 data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
756 parser = _inputstream.ContentAttrParser(data)
757 codec = parser.parse()
758 self.parser.tokenizer.stream.changeEncoding(codec)
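            # e.g. <meta charset="ISO-8859-2"> switches the stream encoding
            # directly, while <meta http-equiv="Content-Type"
            # content="text/html; charset=ISO-8859-2"> goes through
            # ContentAttrParser to pull the charset out of the content value.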
759
760 def startTagTitle(self, token):
761 self.parser.parseRCDataRawtext(token, "RCDATA")
762
763 def startTagNoFramesStyle(self, token):
764 # Need to decide whether to implement the scripting-disabled case
765 self.parser.parseRCDataRawtext(token, "RAWTEXT")
766
767 def startTagNoscript(self, token):
768 if self.parser.scripting:
769 self.parser.parseRCDataRawtext(token, "RAWTEXT")
770 else:
771 self.tree.insertElement(token)
772 self.parser.phase = self.parser.phases["inHeadNoscript"]
773
774 def startTagScript(self, token):
775 self.tree.insertElement(token)
776 self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
777 self.parser.originalPhase = self.parser.phase
778 self.parser.phase = self.parser.phases["text"]
779
780 def startTagOther(self, token):
781 self.anythingElse()
782 return token
783
784 def endTagHead(self, token):
785 node = self.parser.tree.openElements.pop()
786 assert node.name == "head", "Expected head got %s" % node.name
787 self.parser.phase = self.parser.phases["afterHead"]
788
789 def endTagHtmlBodyBr(self, token):
790 self.anythingElse()
791 return token
792
793 def endTagOther(self, token):
794 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
795
796 def anythingElse(self):
797 self.endTagHead(impliedTagToken("head"))
798
799 class InHeadNoscriptPhase(Phase):
800 def __init__(self, parser, tree):
801 Phase.__init__(self, parser, tree)
802
803 self.startTagHandler = _utils.MethodDispatcher([
804 ("html", self.startTagHtml),
805 (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
806 (("head", "noscript"), self.startTagHeadNoscript),
807 ])
808 self.startTagHandler.default = self.startTagOther
809
810 self.endTagHandler = _utils.MethodDispatcher([
811 ("noscript", self.endTagNoscript),
812 ("br", self.endTagBr),
813 ])
814 self.endTagHandler.default = self.endTagOther
815
816 def processEOF(self):
817 self.parser.parseError("eof-in-head-noscript")
818 self.anythingElse()
819 return True
820
821 def processComment(self, token):
822 return self.parser.phases["inHead"].processComment(token)
823
824 def processCharacters(self, token):
825 self.parser.parseError("char-in-head-noscript")
826 self.anythingElse()
827 return token
828
829 def processSpaceCharacters(self, token):
830 return self.parser.phases["inHead"].processSpaceCharacters(token)
831
832 def startTagHtml(self, token):
833 return self.parser.phases["inBody"].processStartTag(token)
834
835 def startTagBaseLinkCommand(self, token):
836 return self.parser.phases["inHead"].processStartTag(token)
837
838 def startTagHeadNoscript(self, token):
839 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
840
841 def startTagOther(self, token):
842 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
843 self.anythingElse()
844 return token
845
846 def endTagNoscript(self, token):
847 node = self.parser.tree.openElements.pop()
848 assert node.name == "noscript", "Expected noscript got %s" % node.name
849 self.parser.phase = self.parser.phases["inHead"]
850
851 def endTagBr(self, token):
852 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
853 self.anythingElse()
854 return token
855
856 def endTagOther(self, token):
857 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
858
859 def anythingElse(self):
860 # Caller must raise parse error first!
861 self.endTagNoscript(impliedTagToken("noscript"))
862
863 class AfterHeadPhase(Phase):
864 def __init__(self, parser, tree):
865 Phase.__init__(self, parser, tree)
866
867 self.startTagHandler = _utils.MethodDispatcher([
868 ("html", self.startTagHtml),
869 ("body", self.startTagBody),
870 ("frameset", self.startTagFrameset),
871 (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
872 "style", "title"),
873 self.startTagFromHead),
874 ("head", self.startTagHead)
875 ])
876 self.startTagHandler.default = self.startTagOther
877 self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
878 self.endTagHtmlBodyBr)])
879 self.endTagHandler.default = self.endTagOther
880
881 def processEOF(self):
882 self.anythingElse()
883 return True
884
885 def processCharacters(self, token):
886 self.anythingElse()
887 return token
888
889 def startTagHtml(self, token):
890 return self.parser.phases["inBody"].processStartTag(token)
891
892 def startTagBody(self, token):
893 self.parser.framesetOK = False
894 self.tree.insertElement(token)
895 self.parser.phase = self.parser.phases["inBody"]
896
897 def startTagFrameset(self, token):
898 self.tree.insertElement(token)
899 self.parser.phase = self.parser.phases["inFrameset"]
900
901 def startTagFromHead(self, token):
902 self.parser.parseError("unexpected-start-tag-out-of-my-head",
903 {"name": token["name"]})
904 self.tree.openElements.append(self.tree.headPointer)
905 self.parser.phases["inHead"].processStartTag(token)
906 for node in self.tree.openElements[::-1]:
907 if node.name == "head":
908 self.tree.openElements.remove(node)
909 break
910
911 def startTagHead(self, token):
912 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
913
914 def startTagOther(self, token):
915 self.anythingElse()
916 return token
917
918 def endTagHtmlBodyBr(self, token):
919 self.anythingElse()
920 return token
921
922 def endTagOther(self, token):
923 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
924
925 def anythingElse(self):
926 self.tree.insertElement(impliedTagToken("body", "StartTag"))
927 self.parser.phase = self.parser.phases["inBody"]
928 self.parser.framesetOK = True
929
930 class InBodyPhase(Phase):
931 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
932 # the really-really-really-very crazy mode
933 def __init__(self, parser, tree):
934 Phase.__init__(self, parser, tree)
935
936 # Set this to the default handler
937 self.processSpaceCharacters = self.processSpaceCharactersNonPre
938
939 self.startTagHandler = _utils.MethodDispatcher([
940 ("html", self.startTagHtml),
941 (("base", "basefont", "bgsound", "command", "link", "meta",
942 "script", "style", "title"),
943 self.startTagProcessInHead),
944 ("body", self.startTagBody),
945 ("frameset", self.startTagFrameset),
946 (("address", "article", "aside", "blockquote", "center", "details",
947 "dir", "div", "dl", "fieldset", "figcaption", "figure",
948 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
949 "section", "summary", "ul"),
950 self.startTagCloseP),
951 (headingElements, self.startTagHeading),
952 (("pre", "listing"), self.startTagPreListing),
953 ("form", self.startTagForm),
954 (("li", "dd", "dt"), self.startTagListItem),
955 ("plaintext", self.startTagPlaintext),
956 ("a", self.startTagA),
957 (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
958 "strong", "tt", "u"), self.startTagFormatting),
959 ("nobr", self.startTagNobr),
960 ("button", self.startTagButton),
961 (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
962 ("xmp", self.startTagXmp),
963 ("table", self.startTagTable),
964 (("area", "br", "embed", "img", "keygen", "wbr"),
965 self.startTagVoidFormatting),
966 (("param", "source", "track"), self.startTagParamSource),
967 ("input", self.startTagInput),
968 ("hr", self.startTagHr),
969 ("image", self.startTagImage),
970 ("isindex", self.startTagIsIndex),
971 ("textarea", self.startTagTextarea),
972 ("iframe", self.startTagIFrame),
973 ("noscript", self.startTagNoscript),
974 (("noembed", "noframes"), self.startTagRawtext),
975 ("select", self.startTagSelect),
976 (("rp", "rt"), self.startTagRpRt),
977 (("option", "optgroup"), self.startTagOpt),
978 (("math"), self.startTagMath),
979 (("svg"), self.startTagSvg),
980 (("caption", "col", "colgroup", "frame", "head",
981 "tbody", "td", "tfoot", "th", "thead",
982 "tr"), self.startTagMisplaced)
983 ])
984 self.startTagHandler.default = self.startTagOther
985
986 self.endTagHandler = _utils.MethodDispatcher([
987 ("body", self.endTagBody),
988 ("html", self.endTagHtml),
989 (("address", "article", "aside", "blockquote", "button", "center",
990 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
991 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
992 "section", "summary", "ul"), self.endTagBlock),
993 ("form", self.endTagForm),
994 ("p", self.endTagP),
995 (("dd", "dt", "li"), self.endTagListItem),
996 (headingElements, self.endTagHeading),
997 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
998 "strike", "strong", "tt", "u"), self.endTagFormatting),
999 (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
1000 ("br", self.endTagBr),
1001 ])
1002 self.endTagHandler.default = self.endTagOther
1003
1004 def isMatchingFormattingElement(self, node1, node2):
1005 return (node1.name == node2.name and
1006 node1.namespace == node2.namespace and
1007 node1.attributes == node2.attributes)
1008
1009 # helper
1010 def addFormattingElement(self, token):
1011 self.tree.insertElement(token)
1012 element = self.tree.openElements[-1]
1013
1014 matchingElements = []
1015 for node in self.tree.activeFormattingElements[::-1]:
1016 if node is Marker:
1017 break
1018 elif self.isMatchingFormattingElement(node, element):
1019 matchingElements.append(node)
1020
1021 assert len(matchingElements) <= 3
1022 if len(matchingElements) == 3:
1023 self.tree.activeFormattingElements.remove(matchingElements[-1])
1024 self.tree.activeFormattingElements.append(element)
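# Note: the matching loop above implements the spec's "Noah's Ark
# clause": at most three entries with the same name, namespace and
# attributes may appear between the last marker and the end of the list
# of active formatting elements.  E.g. for "<b><b><b><b>x" only the
# three most recent <b> entries are kept, so formatting reconstruction
# cannot re-open an unbounded number of identical <b> elements.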
1025
1026 # the real deal
1027 def processEOF(self):
1028 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
1029 "tfoot", "th", "thead", "tr", "body",
1030 "html"))
1031 for node in self.tree.openElements[::-1]:
1032 if node.name not in allowed_elements:
1033 self.parser.parseError("expected-closing-tag-but-got-eof")
1034 break
1035 # Stop parsing
1036
1037 def processSpaceCharactersDropNewline(self, token):
1038 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
1039 # want to drop leading newlines
1040 data = token["data"]
1041 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1042 if (data.startswith("\n") and
1043 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
1044 not self.tree.openElements[-1].hasContent()):
1045 data = data[1:]
1046 if data:
1047 self.tree.reconstructActiveFormattingElements()
1048 self.tree.insertText(data)
1049
1050 def processCharacters(self, token):
1051 if token["data"] == "\u0000":
1052 # The tokenizer should always emit null on its own
1053 return
1054 self.tree.reconstructActiveFormattingElements()
1055 self.tree.insertText(token["data"])
1056 # This must be bad for performance
1057 if (self.parser.framesetOK and
1058 any([char not in spaceCharacters
1059 for char in token["data"]])):
1060 self.parser.framesetOK = False
1061
1062 def processSpaceCharactersNonPre(self, token):
1063 self.tree.reconstructActiveFormattingElements()
1064 self.tree.insertText(token["data"])
1065
1066 def startTagProcessInHead(self, token):
1067 return self.parser.phases["inHead"].processStartTag(token)
1068
1069 def startTagBody(self, token):
1070 self.parser.parseError("unexpected-start-tag", {"name": "body"})
1071 if (len(self.tree.openElements) == 1 or
1072 self.tree.openElements[1].name != "body"):
1073 assert self.parser.innerHTML
1074 else:
1075 self.parser.framesetOK = False
1076 for attr, value in token["data"].items():
1077 if attr not in self.tree.openElements[1].attributes:
1078 self.tree.openElements[1].attributes[attr] = value
1079
1080 def startTagFrameset(self, token):
1081 self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1082 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1083 assert self.parser.innerHTML
1084 elif not self.parser.framesetOK:
1085 pass
1086 else:
1087 if self.tree.openElements[1].parent:
1088 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1089 while self.tree.openElements[-1].name != "html":
1090 self.tree.openElements.pop()
1091 self.tree.insertElement(token)
1092 self.parser.phase = self.parser.phases["inFrameset"]
1093
1094 def startTagCloseP(self, token):
1095 if self.tree.elementInScope("p", variant="button"):
1096 self.endTagP(impliedTagToken("p"))
1097 self.tree.insertElement(token)
1098
1099 def startTagPreListing(self, token):
1100 if self.tree.elementInScope("p", variant="button"):
1101 self.endTagP(impliedTagToken("p"))
1102 self.tree.insertElement(token)
1103 self.parser.framesetOK = False
1104 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1105
1106 def startTagForm(self, token):
1107 if self.tree.formPointer:
1108 self.parser.parseError("unexpected-start-tag", {"name": "form"})
1109 else:
1110 if self.tree.elementInScope("p", variant="button"):
1111 self.endTagP(impliedTagToken("p"))
1112 self.tree.insertElement(token)
1113 self.tree.formPointer = self.tree.openElements[-1]
1114
1115 def startTagListItem(self, token):
1116 self.parser.framesetOK = False
1117
1118 stopNamesMap = {"li": ["li"],
1119 "dt": ["dt", "dd"],
1120 "dd": ["dt", "dd"]}
1121 stopNames = stopNamesMap[token["name"]]
1122 for node in reversed(self.tree.openElements):
1123 if node.name in stopNames:
1124 self.parser.phase.processEndTag(
1125 impliedTagToken(node.name, "EndTag"))
1126 break
1127 if (node.nameTuple in specialElements and
1128 node.name not in ("address", "div", "p")):
1129 break
1130
1131 if self.tree.elementInScope("p", variant="button"):
1132 self.parser.phase.processEndTag(
1133 impliedTagToken("p", "EndTag"))
1134
1135 self.tree.insertElement(token)
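# Example: in "<ul><li>one<li>two", the second <li> start tag walks the
# loop above, finds the still-open <li>, and emits an implied </li> for
# it, so the two list items end up as siblings instead of nesting.  The
# same logic applies to <dd>/<dt> inside <dl>.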
1136
1137 def startTagPlaintext(self, token):
1138 if self.tree.elementInScope("p", variant="button"):
1139 self.endTagP(impliedTagToken("p"))
1140 self.tree.insertElement(token)
1141 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1142
1143 def startTagHeading(self, token):
1144 if self.tree.elementInScope("p", variant="button"):
1145 self.endTagP(impliedTagToken("p"))
1146 if self.tree.openElements[-1].name in headingElements:
1147 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1148 self.tree.openElements.pop()
1149 self.tree.insertElement(token)
1150
1151 def startTagA(self, token):
1152 afeAElement = self.tree.elementInActiveFormattingElements("a")
1153 if afeAElement:
1154 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1155 {"startName": "a", "endName": "a"})
1156 self.endTagFormatting(impliedTagToken("a"))
1157 if afeAElement in self.tree.openElements:
1158 self.tree.openElements.remove(afeAElement)
1159 if afeAElement in self.tree.activeFormattingElements:
1160 self.tree.activeFormattingElements.remove(afeAElement)
1161 self.tree.reconstructActiveFormattingElements()
1162 self.addFormattingElement(token)
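# Example: "<a>1<a>2" is a parse error; the second <a> is treated as if
# "</a>" had been seen first (via endTagFormatting above), so the result
# is two sibling <a> elements rather than a nested one.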
1163
1164 def startTagFormatting(self, token):
1165 self.tree.reconstructActiveFormattingElements()
1166 self.addFormattingElement(token)
1167
1168 def startTagNobr(self, token):
1169 self.tree.reconstructActiveFormattingElements()
1170 if self.tree.elementInScope("nobr"):
1171 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1172 {"startName": "nobr", "endName": "nobr"})
1173 self.processEndTag(impliedTagToken("nobr"))
1174 # XXX Need tests that trigger the following
1175 self.tree.reconstructActiveFormattingElements()
1176 self.addFormattingElement(token)
1177
1178 def startTagButton(self, token):
1179 if self.tree.elementInScope("button"):
1180 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1181 {"startName": "button", "endName": "button"})
1182 self.processEndTag(impliedTagToken("button"))
1183 return token
1184 else:
1185 self.tree.reconstructActiveFormattingElements()
1186 self.tree.insertElement(token)
1187 self.parser.framesetOK = False
1188
1189 def startTagAppletMarqueeObject(self, token):
1190 self.tree.reconstructActiveFormattingElements()
1191 self.tree.insertElement(token)
1192 self.tree.activeFormattingElements.append(Marker)
1193 self.parser.framesetOK = False
1194
1195 def startTagXmp(self, token):
1196 if self.tree.elementInScope("p", variant="button"):
1197 self.endTagP(impliedTagToken("p"))
1198 self.tree.reconstructActiveFormattingElements()
1199 self.parser.framesetOK = False
1200 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1201
1202 def startTagTable(self, token):
1203 if self.parser.compatMode != "quirks":
1204 if self.tree.elementInScope("p", variant="button"):
1205 self.processEndTag(impliedTagToken("p"))
1206 self.tree.insertElement(token)
1207 self.parser.framesetOK = False
1208 self.parser.phase = self.parser.phases["inTable"]
1209
1210 def startTagVoidFormatting(self, token):
1211 self.tree.reconstructActiveFormattingElements()
1212 self.tree.insertElement(token)
1213 self.tree.openElements.pop()
1214 token["selfClosingAcknowledged"] = True
1215 self.parser.framesetOK = False
1216
1217 def startTagInput(self, token):
1218 framesetOK = self.parser.framesetOK
1219 self.startTagVoidFormatting(token)
1220 if ("type" in token["data"] and
1221 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1222 # input type=hidden doesn't change framesetOK
1223 self.parser.framesetOK = framesetOK
1224
1225 def startTagParamSource(self, token):
1226 self.tree.insertElement(token)
1227 self.tree.openElements.pop()
1228 token["selfClosingAcknowledged"] = True
1229
1230 def startTagHr(self, token):
1231 if self.tree.elementInScope("p", variant="button"):
1232 self.endTagP(impliedTagToken("p"))
1233 self.tree.insertElement(token)
1234 self.tree.openElements.pop()
1235 token["selfClosingAcknowledged"] = True
1236 self.parser.framesetOK = False
1237
1238 def startTagImage(self, token):
1239 # No really...
1240 self.parser.parseError("unexpected-start-tag-treated-as",
1241 {"originalName": "image", "newName": "img"})
1242 self.processStartTag(impliedTagToken("img", "StartTag",
1243 attributes=token["data"],
1244 selfClosing=token["selfClosing"]))
1245
1246 def startTagIsIndex(self, token):
1247 self.parser.parseError("deprecated-tag", {"name": "isindex"})
1248 if self.tree.formPointer:
1249 return
1250 form_attrs = {}
1251 if "action" in token["data"]:
1252 form_attrs["action"] = token["data"]["action"]
1253 self.processStartTag(impliedTagToken("form", "StartTag",
1254 attributes=form_attrs))
1255 self.processStartTag(impliedTagToken("hr", "StartTag"))
1256 self.processStartTag(impliedTagToken("label", "StartTag"))
1257 # XXX Localization ...
1258 if "prompt" in token["data"]:
1259 prompt = token["data"]["prompt"]
1260 else:
1261 prompt = "This is a searchable index. Enter search keywords: "
1262 self.processCharacters(
1263 {"type": tokenTypes["Characters"], "data": prompt})
1264 attributes = token["data"].copy()
1265 if "action" in attributes:
1266 del attributes["action"]
1267 if "prompt" in attributes:
1268 del attributes["prompt"]
1269 attributes["name"] = "isindex"
1270 self.processStartTag(impliedTagToken("input", "StartTag",
1271 attributes=attributes,
1272 selfClosing=token["selfClosing"]))
1273 self.processEndTag(impliedTagToken("label"))
1274 self.processStartTag(impliedTagToken("hr", "StartTag"))
1275 self.processEndTag(impliedTagToken("form"))
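# Example: the deprecated "<isindex prompt='Find:' action='/search'>" is
# rewritten on the fly into roughly
#   <form action="/search"><hr><label>Find:<input name="isindex"></label><hr></form>
# by feeding the implied start/end tag tokens above back through the
# normal processing methods.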
1276
1277 def startTagTextarea(self, token):
1278 self.tree.insertElement(token)
1279 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1280 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1281 self.parser.framesetOK = False
1282
1283 def startTagIFrame(self, token):
1284 self.parser.framesetOK = False
1285 self.startTagRawtext(token)
1286
1287 def startTagNoscript(self, token):
1288 if self.parser.scripting:
1289 self.startTagRawtext(token)
1290 else:
1291 self.startTagOther(token)
1292
1293 def startTagRawtext(self, token):
1294 """iframe, noembed noframes, noscript(if scripting enabled)"""
1295 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1296
1297 def startTagOpt(self, token):
1298 if self.tree.openElements[-1].name == "option":
1299 self.parser.phase.processEndTag(impliedTagToken("option"))
1300 self.tree.reconstructActiveFormattingElements()
1301 self.parser.tree.insertElement(token)
1302
1303 def startTagSelect(self, token):
1304 self.tree.reconstructActiveFormattingElements()
1305 self.tree.insertElement(token)
1306 self.parser.framesetOK = False
1307 if self.parser.phase in (self.parser.phases["inTable"],
1308 self.parser.phases["inCaption"],
1309 self.parser.phases["inColumnGroup"],
1310 self.parser.phases["inTableBody"],
1311 self.parser.phases["inRow"],
1312 self.parser.phases["inCell"]):
1313 self.parser.phase = self.parser.phases["inSelectInTable"]
1314 else:
1315 self.parser.phase = self.parser.phases["inSelect"]
1316
1317 def startTagRpRt(self, token):
1318 if self.tree.elementInScope("ruby"):
1319 self.tree.generateImpliedEndTags()
1320 if self.tree.openElements[-1].name != "ruby":
1321 self.parser.parseError()
1322 self.tree.insertElement(token)
1323
1324 def startTagMath(self, token):
1325 self.tree.reconstructActiveFormattingElements()
1326 self.parser.adjustMathMLAttributes(token)
1327 self.parser.adjustForeignAttributes(token)
1328 token["namespace"] = namespaces["mathml"]
1329 self.tree.insertElement(token)
1330 # Need to get the parse error right for the case where the token
1331 # has a namespace not equal to the xmlns attribute
1332 if token["selfClosing"]:
1333 self.tree.openElements.pop()
1334 token["selfClosingAcknowledged"] = True
1335
1336 def startTagSvg(self, token):
1337 self.tree.reconstructActiveFormattingElements()
1338 self.parser.adjustSVGAttributes(token)
1339 self.parser.adjustForeignAttributes(token)
1340 token["namespace"] = namespaces["svg"]
1341 self.tree.insertElement(token)
1342 # Need to get the parse error right for the case where the token
1343 # has a namespace not equal to the xmlns attribute
1344 if token["selfClosing"]:
1345 self.tree.openElements.pop()
1346 token["selfClosingAcknowledged"] = True
1347
1348 def startTagMisplaced(self, token):
1349 """ Elements that should be children of other elements that have a
1350 different insertion mode; here they are ignored
1351 "caption", "col", "colgroup", "frame", "frameset", "head",
1352 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1353 "tr", "noscript"
1354 """
1355 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1356
1357 def startTagOther(self, token):
1358 self.tree.reconstructActiveFormattingElements()
1359 self.tree.insertElement(token)
1360
1361 def endTagP(self, token):
1362 if not self.tree.elementInScope("p", variant="button"):
1363 self.startTagCloseP(impliedTagToken("p", "StartTag"))
1364 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1365 self.endTagP(impliedTagToken("p", "EndTag"))
1366 else:
1367 self.tree.generateImpliedEndTags("p")
1368 if self.tree.openElements[-1].name != "p":
1369 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1370 node = self.tree.openElements.pop()
1371 while node.name != "p":
1372 node = self.tree.openElements.pop()
1373
1374 def endTagBody(self, token):
1375 if not self.tree.elementInScope("body"):
1376 self.parser.parseError()
1377 return
1378 elif self.tree.openElements[-1].name != "body":
1379 for node in self.tree.openElements[2:]:
1380 if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1381 "option", "p", "rp", "rt",
1382 "tbody", "td", "tfoot",
1383 "th", "thead", "tr", "body",
1384 "html")):
1385 # Not sure this is the correct name for the parse error
1386 self.parser.parseError(
1387 "expected-one-end-tag-but-got-another",
1388 {"gotName": "body", "expectedName": node.name})
1389 break
1390 self.parser.phase = self.parser.phases["afterBody"]
1391
1392 def endTagHtml(self, token):
1393 # We repeat the test for the body end tag token being ignored here
1394 if self.tree.elementInScope("body"):
1395 self.endTagBody(impliedTagToken("body"))
1396 return token
1397
1398 def endTagBlock(self, token):
1399 # Put us back in the right whitespace handling mode
1400 if token["name"] == "pre":
1401 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1402 inScope = self.tree.elementInScope(token["name"])
1403 if inScope:
1404 self.tree.generateImpliedEndTags()
1405 if self.tree.openElements[-1].name != token["name"]:
1406 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1407 if inScope:
1408 node = self.tree.openElements.pop()
1409 while node.name != token["name"]:
1410 node = self.tree.openElements.pop()
1411
1412 def endTagForm(self, token):
1413 node = self.tree.formPointer
1414 self.tree.formPointer = None
1415 if node is None or not self.tree.elementInScope(node):
1416 self.parser.parseError("unexpected-end-tag",
1417 {"name": "form"})
1418 else:
1419 self.tree.generateImpliedEndTags()
1420 if self.tree.openElements[-1] != node:
1421 self.parser.parseError("end-tag-too-early-ignored",
1422 {"name": "form"})
1423 self.tree.openElements.remove(node)
1424
1425 def endTagListItem(self, token):
1426 if token["name"] == "li":
1427 variant = "list"
1428 else:
1429 variant = None
1430 if not self.tree.elementInScope(token["name"], variant=variant):
1431 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1432 else:
1433 self.tree.generateImpliedEndTags(exclude=token["name"])
1434 if self.tree.openElements[-1].name != token["name"]:
1435 self.parser.parseError(
1436 "end-tag-too-early",
1437 {"name": token["name"]})
1438 node = self.tree.openElements.pop()
1439 while node.name != token["name"]:
1440 node = self.tree.openElements.pop()
1441
1442 def endTagHeading(self, token):
1443 for item in headingElements:
1444 if self.tree.elementInScope(item):
1445 self.tree.generateImpliedEndTags()
1446 break
1447 if self.tree.openElements[-1].name != token["name"]:
1448 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1449
1450 for item in headingElements:
1451 if self.tree.elementInScope(item):
1452 item = self.tree.openElements.pop()
1453 while item.name not in headingElements:
1454 item = self.tree.openElements.pop()
1455 break
1456
1457 def endTagFormatting(self, token):
1458 """The much-feared adoption agency algorithm"""
1459 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
1460 # XXX Better parseError messages appreciated.
1461
1462 # Step 1
1463 outerLoopCounter = 0
1464
1465 # Step 2
1466 while outerLoopCounter < 8:
1467
1468 # Step 3
1469 outerLoopCounter += 1
1470
1471 # Step 4:
1472
1473 # Let the formatting element be the last element in
1474 # the list of active formatting elements that:
1475 # - is between the end of the list and the last scope
1476 # marker in the list, if any, or the start of the list
1477 # otherwise, and
1478 # - has the same tag name as the token.
1479 formattingElement = self.tree.elementInActiveFormattingElements(
1480 token["name"])
1481 if (not formattingElement or
1482 (formattingElement in self.tree.openElements and
1483 not self.tree.elementInScope(formattingElement.name))):
1484 # If there is no such node, then abort these steps
1485 # and instead act as described in the "any other
1486 # end tag" entry below.
1487 self.endTagOther(token)
1488 return
1489
1490 # Otherwise, if there is such a node, but that node is
1491 # not in the stack of open elements, then this is a
1492 # parse error; remove the element from the list, and
1493 # abort these steps.
1494 elif formattingElement not in self.tree.openElements:
1495 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
1496 self.tree.activeFormattingElements.remove(formattingElement)
1497 return
1498
1499 # Otherwise, if there is such a node, and that node is
1500 # also in the stack of open elements, but the element
1501 # is not in scope, then this is a parse error; ignore
1502 # the token, and abort these steps.
1503 elif not self.tree.elementInScope(formattingElement.name):
1504 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
1505 return
1506
1507 # Otherwise, there is a formatting element and that
1508 # element is in the stack and is in scope. If the
1509 # element is not the current node, this is a parse
1510 # error. In any case, proceed with the algorithm as
1511 # written in the following steps.
1512 else:
1513 if formattingElement != self.tree.openElements[-1]:
1514 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
1515
1516 # Step 5:
1517
1518 # Let the furthest block be the topmost node in the
1519 # stack of open elements that is lower in the stack
1520 # than the formatting element, and is an element in
1521 # the special category. There might not be one.
1522 afeIndex = self.tree.openElements.index(formattingElement)
1523 furthestBlock = None
1524 for element in self.tree.openElements[afeIndex:]:
1525 if element.nameTuple in specialElements:
1526 furthestBlock = element
1527 break
1528
1529 # Step 6:
1530
1531 # If there is no furthest block, then the UA must
1532 # first pop all the nodes from the bottom of the stack
1533 # of open elements, from the current node up to and
1534 # including the formatting element, then remove the
1535 # formatting element from the list of active
1536 # formatting elements, and finally abort these steps.
1537 if furthestBlock is None:
1538 element = self.tree.openElements.pop()
1539 while element != formattingElement:
1540 element = self.tree.openElements.pop()
1541 self.tree.activeFormattingElements.remove(element)
1542 return
1543
1544 # Step 7
1545 commonAncestor = self.tree.openElements[afeIndex - 1]
1546
1547 # Step 8:
1548 # The bookmark is supposed to help us identify where to reinsert
1549 # nodes in step 15. We have to ensure that we reinsert nodes after
1550 # the node before the active formatting element. Note the bookmark
1551 # can move in step 9.7
1552 bookmark = self.tree.activeFormattingElements.index(formattingElement)
1553
1554 # Step 9
1555 lastNode = node = furthestBlock
1556 innerLoopCounter = 0
1557
1558 index = self.tree.openElements.index(node)
1559 while innerLoopCounter < 3:
1560 innerLoopCounter += 1
1561 # node is the element immediately above the previous node in the stack of open elements
1562 index -= 1
1563 node = self.tree.openElements[index]
1564 if node not in self.tree.activeFormattingElements:
1565 self.tree.openElements.remove(node)
1566 continue
1567 # Step 9.6
1568 if node == formattingElement:
1569 break
1570 # Step 9.7
1571 if lastNode == furthestBlock:
1572 bookmark = self.tree.activeFormattingElements.index(node) + 1
1573 # Step 9.8
1574 clone = node.cloneNode()
1575 # Replace node with clone
1576 self.tree.activeFormattingElements[
1577 self.tree.activeFormattingElements.index(node)] = clone
1578 self.tree.openElements[
1579 self.tree.openElements.index(node)] = clone
1580 node = clone
1581 # Step 9.9
1582 # Remove lastNode from its parents, if any
1583 if lastNode.parent:
1584 lastNode.parent.removeChild(lastNode)
1585 node.appendChild(lastNode)
1586 # Step 9.10
1587 lastNode = node
1588
1589 # Step 10
1590 # Foster parent lastNode if commonAncestor is a
1591 # table, tbody, tfoot, thead, or tr we need to foster
1592 # parent the lastNode
1593 if lastNode.parent:
1594 lastNode.parent.removeChild(lastNode)
1595
1596 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
1597 parent, insertBefore = self.tree.getTableMisnestedNodePosition()
1598 parent.insertBefore(lastNode, insertBefore)
1599 else:
1600 commonAncestor.appendChild(lastNode)
1601
1602 # Step 11
1603 clone = formattingElement.cloneNode()
1604
1605 # Step 12
1606 furthestBlock.reparentChildren(clone)
1607
1608 # Step 13
1609 furthestBlock.appendChild(clone)
1610
1611 # Step 14
1612 self.tree.activeFormattingElements.remove(formattingElement)
1613 self.tree.activeFormattingElements.insert(bookmark, clone)
1614
1615 # Step 15
1616 self.tree.openElements.remove(formattingElement)
1617 self.tree.openElements.insert(
1618 self.tree.openElements.index(furthestBlock) + 1, clone)
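# Example: for the misnested input "<b>1<p>2</b>3", the </b> end tag
# runs this algorithm with <p> as the furthest block; the <b> is cloned
# into the paragraph, giving a tree equivalent to
#   <b>1</b><p><b>2</b>3</p>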
1619
1620 def endTagAppletMarqueeObject(self, token):
1621 if self.tree.elementInScope(token["name"]):
1622 self.tree.generateImpliedEndTags()
1623 if self.tree.openElements[-1].name != token["name"]:
1624 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1625
1626 if self.tree.elementInScope(token["name"]):
1627 element = self.tree.openElements.pop()
1628 while element.name != token["name"]:
1629 element = self.tree.openElements.pop()
1630 self.tree.clearActiveFormattingElements()
1631
1632 def endTagBr(self, token):
1633 self.parser.parseError("unexpected-end-tag-treated-as",
1634 {"originalName": "br", "newName": "br element"})
1635 self.tree.reconstructActiveFormattingElements()
1636 self.tree.insertElement(impliedTagToken("br", "StartTag"))
1637 self.tree.openElements.pop()
1638
1639 def endTagOther(self, token):
1640 for node in self.tree.openElements[::-1]:
1641 if node.name == token["name"]:
1642 self.tree.generateImpliedEndTags(exclude=token["name"])
1643 if self.tree.openElements[-1].name != token["name"]:
1644 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1645 while self.tree.openElements.pop() != node:
1646 pass
1647 break
1648 else:
1649 if node.nameTuple in specialElements:
1650 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1651 break
1652
1653 class TextPhase(Phase):
1654 def __init__(self, parser, tree):
1655 Phase.__init__(self, parser, tree)
1656 self.startTagHandler = _utils.MethodDispatcher([])
1657 self.startTagHandler.default = self.startTagOther
1658 self.endTagHandler = _utils.MethodDispatcher([
1659 ("script", self.endTagScript)])
1660 self.endTagHandler.default = self.endTagOther
1661
1662 def processCharacters(self, token):
1663 self.tree.insertText(token["data"])
1664
1665 def processEOF(self):
1666 self.parser.parseError("expected-named-closing-tag-but-got-eof",
1667 {"name": self.tree.openElements[-1].name})
1668 self.tree.openElements.pop()
1669 self.parser.phase = self.parser.originalPhase
1670 return True
1671
1672 def startTagOther(self, token):
1673 assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
1674
1675 def endTagScript(self, token):
1676 node = self.tree.openElements.pop()
1677 assert node.name == "script"
1678 self.parser.phase = self.parser.originalPhase
1679 # The rest of this method is all stuff that only happens if
1680 # document.write works
1681
1682 def endTagOther(self, token):
1683 self.tree.openElements.pop()
1684 self.parser.phase = self.parser.originalPhase
1685
1686 class InTablePhase(Phase):
1687 # http://www.whatwg.org/specs/web-apps/current-work/#in-table
1688 def __init__(self, parser, tree):
1689 Phase.__init__(self, parser, tree)
1690 self.startTagHandler = _utils.MethodDispatcher([
1691 ("html", self.startTagHtml),
1692 ("caption", self.startTagCaption),
1693 ("colgroup", self.startTagColgroup),
1694 ("col", self.startTagCol),
1695 (("tbody", "tfoot", "thead"), self.startTagRowGroup),
1696 (("td", "th", "tr"), self.startTagImplyTbody),
1697 ("table", self.startTagTable),
1698 (("style", "script"), self.startTagStyleScript),
1699 ("input", self.startTagInput),
1700 ("form", self.startTagForm)
1701 ])
1702 self.startTagHandler.default = self.startTagOther
1703
1704 self.endTagHandler = _utils.MethodDispatcher([
1705 ("table", self.endTagTable),
1706 (("body", "caption", "col", "colgroup", "html", "tbody", "td",
1707 "tfoot", "th", "thead", "tr"), self.endTagIgnore)
1708 ])
1709 self.endTagHandler.default = self.endTagOther
1710
1711 # helper methods
1712 def clearStackToTableContext(self):
1713 # "clear the stack back to a table context"
1714 while self.tree.openElements[-1].name not in ("table", "html"):
1715 # self.parser.parseError("unexpected-implied-end-tag-in-table",
1716 # {"name": self.tree.openElements[-1].name})
1717 self.tree.openElements.pop()
1718 # When the current node is <html> it's an innerHTML case
1719
1720 # processing methods
1721 def processEOF(self):
1722 if self.tree.openElements[-1].name != "html":
1723 self.parser.parseError("eof-in-table")
1724 else:
1725 assert self.parser.innerHTML
1726 # Stop parsing
1727
1728 def processSpaceCharacters(self, token):
1729 originalPhase = self.parser.phase
1730 self.parser.phase = self.parser.phases["inTableText"]
1731 self.parser.phase.originalPhase = originalPhase
1732 self.parser.phase.processSpaceCharacters(token)
1733
1734 def processCharacters(self, token):
1735 originalPhase = self.parser.phase
1736 self.parser.phase = self.parser.phases["inTableText"]
1737 self.parser.phase.originalPhase = originalPhase
1738 self.parser.phase.processCharacters(token)
1739
1740 def insertText(self, token):
1741 # If we get here there must be at least one non-whitespace character
1742 # Do the table magic!
1743 self.tree.insertFromTable = True
1744 self.parser.phases["inBody"].processCharacters(token)
1745 self.tree.insertFromTable = False
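# Note: with insertFromTable set, the tree builder "foster parents" the
# text -- e.g. in "<table>oops<tr><td>x" the characters "oops" are
# inserted immediately before the <table> element instead of inside it.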
1746
1747 def startTagCaption(self, token):
1748 self.clearStackToTableContext()
1749 self.tree.activeFormattingElements.append(Marker)
1750 self.tree.insertElement(token)
1751 self.parser.phase = self.parser.phases["inCaption"]
1752
1753 def startTagColgroup(self, token):
1754 self.clearStackToTableContext()
1755 self.tree.insertElement(token)
1756 self.parser.phase = self.parser.phases["inColumnGroup"]
1757
1758 def startTagCol(self, token):
1759 self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1760 return token
1761
1762 def startTagRowGroup(self, token):
1763 self.clearStackToTableContext()
1764 self.tree.insertElement(token)
1765 self.parser.phase = self.parser.phases["inTableBody"]
1766
1767 def startTagImplyTbody(self, token):
1768 self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1769 return token
1770
1771 def startTagTable(self, token):
1772 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1773 {"startName": "table", "endName": "table"})
1774 self.parser.phase.processEndTag(impliedTagToken("table"))
1775 if not self.parser.innerHTML:
1776 return token
1777
1778 def startTagStyleScript(self, token):
1779 return self.parser.phases["inHead"].processStartTag(token)
1780
1781 def startTagInput(self, token):
1782 if ("type" in token["data"] and
1783 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1784 self.parser.parseError("unexpected-hidden-input-in-table")
1785 self.tree.insertElement(token)
1786 # XXX associate with form
1787 self.tree.openElements.pop()
1788 else:
1789 self.startTagOther(token)
1790
1791 def startTagForm(self, token):
1792 self.parser.parseError("unexpected-form-in-table")
1793 if self.tree.formPointer is None:
1794 self.tree.insertElement(token)
1795 self.tree.formPointer = self.tree.openElements[-1]
1796 self.tree.openElements.pop()
1797
1798 def startTagOther(self, token):
1799 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
1800 # Do the table magic!
1801 self.tree.insertFromTable = True
1802 self.parser.phases["inBody"].processStartTag(token)
1803 self.tree.insertFromTable = False
1804
1805 def endTagTable(self, token):
1806 if self.tree.elementInScope("table", variant="table"):
1807 self.tree.generateImpliedEndTags()
1808 if self.tree.openElements[-1].name != "table":
1809 self.parser.parseError("end-tag-too-early-named",
1810 {"gotName": "table",
1811 "expectedName": self.tree.openElements[-1].name})
1812 while self.tree.openElements[-1].name != "table":
1813 self.tree.openElements.pop()
1814 self.tree.openElements.pop()
1815 self.parser.resetInsertionMode()
1816 else:
1817 # innerHTML case
1818 assert self.parser.innerHTML
1819 self.parser.parseError()
1820
1821 def endTagIgnore(self, token):
1822 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1823
1824 def endTagOther(self, token):
1825 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
1826 # Do the table magic!
1827 self.tree.insertFromTable = True
1828 self.parser.phases["inBody"].processEndTag(token)
1829 self.tree.insertFromTable = False
1830
1831 class InTableTextPhase(Phase):
1832 def __init__(self, parser, tree):
1833 Phase.__init__(self, parser, tree)
1834 self.originalPhase = None
1835 self.characterTokens = []
1836
1837 def flushCharacters(self):
1838 data = "".join([item["data"] for item in self.characterTokens])
1839 if any([item not in spaceCharacters for item in data]):
1840 token = {"type": tokenTypes["Characters"], "data": data}
1841 self.parser.phases["inTable"].insertText(token)
1842 elif data:
1843 self.tree.insertText(data)
1844 self.characterTokens = []
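# Note: character tokens seen while in a table are buffered and flushed
# here in one go: a run that is entirely whitespace (e.g. the newlines
# between "<table>" and "<tr>") stays inside the table, while a run
# containing anything else is handed to InTablePhase.insertText and
# foster parented before the table.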
1845
1846 def processComment(self, token):
1847 self.flushCharacters()
1848 self.parser.phase = self.originalPhase
1849 return token
1850
1851 def processEOF(self):
1852 self.flushCharacters()
1853 self.parser.phase = self.originalPhase
1854 return True
1855
1856 def processCharacters(self, token):
1857 if token["data"] == "\u0000":
1858 return
1859 self.characterTokens.append(token)
1860
1861 def processSpaceCharacters(self, token):
1862 # pretty sure we should never reach here
1863 self.characterTokens.append(token)
1864 # assert False
1865
1866 def processStartTag(self, token):
1867 self.flushCharacters()
1868 self.parser.phase = self.originalPhase
1869 return token
1870
1871 def processEndTag(self, token):
1872 self.flushCharacters()
1873 self.parser.phase = self.originalPhase
1874 return token
1875
1876 class InCaptionPhase(Phase):
1877 # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1878 def __init__(self, parser, tree):
1879 Phase.__init__(self, parser, tree)
1880
1881 self.startTagHandler = _utils.MethodDispatcher([
1882 ("html", self.startTagHtml),
1883 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
1884 "thead", "tr"), self.startTagTableElement)
1885 ])
1886 self.startTagHandler.default = self.startTagOther
1887
1888 self.endTagHandler = _utils.MethodDispatcher([
1889 ("caption", self.endTagCaption),
1890 ("table", self.endTagTable),
1891 (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
1892 "thead", "tr"), self.endTagIgnore)
1893 ])
1894 self.endTagHandler.default = self.endTagOther
1895
1896 def ignoreEndTagCaption(self):
1897 return not self.tree.elementInScope("caption", variant="table")
1898
1899 def processEOF(self):
1900 self.parser.phases["inBody"].processEOF()
1901
1902 def processCharacters(self, token):
1903 return self.parser.phases["inBody"].processCharacters(token)
1904
1905 def startTagTableElement(self, token):
1906 self.parser.parseError()
1907 # XXX Have to duplicate logic here to find out if the tag is ignored
1908 ignoreEndTag = self.ignoreEndTagCaption()
1909 self.parser.phase.processEndTag(impliedTagToken("caption"))
1910 if not ignoreEndTag:
1911 return token
1912
1913 def startTagOther(self, token):
1914 return self.parser.phases["inBody"].processStartTag(token)
1915
1916 def endTagCaption(self, token):
1917 if not self.ignoreEndTagCaption():
1918 # AT this code is quite similar to endTagTable in "InTable"
1919 self.tree.generateImpliedEndTags()
1920 if self.tree.openElements[-1].name != "caption":
1921 self.parser.parseError("expected-one-end-tag-but-got-another",
1922 {"gotName": "caption",
1923 "expectedName": self.tree.openElements[-1].name})
1924 while self.tree.openElements[-1].name != "caption":
1925 self.tree.openElements.pop()
1926 self.tree.openElements.pop()
1927 self.tree.clearActiveFormattingElements()
1928 self.parser.phase = self.parser.phases["inTable"]
1929 else:
1930 # innerHTML case
1931 assert self.parser.innerHTML
1932 self.parser.parseError()
1933
1934 def endTagTable(self, token):
1935 self.parser.parseError()
1936 ignoreEndTag = self.ignoreEndTagCaption()
1937 self.parser.phase.processEndTag(impliedTagToken("caption"))
1938 if not ignoreEndTag:
1939 return token
1940
1941 def endTagIgnore(self, token):
1942 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1943
1944 def endTagOther(self, token):
1945 return self.parser.phases["inBody"].processEndTag(token)
1946
1947 class InColumnGroupPhase(Phase):
1948 # http://www.whatwg.org/specs/web-apps/current-work/#in-column
1949
1950 def __init__(self, parser, tree):
1951 Phase.__init__(self, parser, tree)
1952
1953 self.startTagHandler = _utils.MethodDispatcher([
1954 ("html", self.startTagHtml),
1955 ("col", self.startTagCol)
1956 ])
1957 self.startTagHandler.default = self.startTagOther
1958
1959 self.endTagHandler = _utils.MethodDispatcher([
1960 ("colgroup", self.endTagColgroup),
1961 ("col", self.endTagCol)
1962 ])
1963 self.endTagHandler.default = self.endTagOther
1964
1965 def ignoreEndTagColgroup(self):
1966 return self.tree.openElements[-1].name == "html"
1967
1968 def processEOF(self):
1969 if self.tree.openElements[-1].name == "html":
1970 assert self.parser.innerHTML
1971 return
1972 else:
1973 ignoreEndTag = self.ignoreEndTagColgroup()
1974 self.endTagColgroup(impliedTagToken("colgroup"))
1975 if not ignoreEndTag:
1976 return True
1977
1978 def processCharacters(self, token):
1979 ignoreEndTag = self.ignoreEndTagColgroup()
1980 self.endTagColgroup(impliedTagToken("colgroup"))
1981 if not ignoreEndTag:
1982 return token
1983
1984 def startTagCol(self, token):
1985 self.tree.insertElement(token)
1986 self.tree.openElements.pop()
1987 token["selfClosingAcknowledged"] = True
1988
1989 def startTagOther(self, token):
1990 ignoreEndTag = self.ignoreEndTagColgroup()
1991 self.endTagColgroup(impliedTagToken("colgroup"))
1992 if not ignoreEndTag:
1993 return token
1994
1995 def endTagColgroup(self, token):
1996 if self.ignoreEndTagColgroup():
1997 # innerHTML case
1998 assert self.parser.innerHTML
1999 self.parser.parseError()
2000 else:
2001 self.tree.openElements.pop()
2002 self.parser.phase = self.parser.phases["inTable"]
2003
2004 def endTagCol(self, token):
2005 self.parser.parseError("no-end-tag", {"name": "col"})
2006
2007 def endTagOther(self, token):
2008 ignoreEndTag = self.ignoreEndTagColgroup()
2009 self.endTagColgroup(impliedTagToken("colgroup"))
2010 if not ignoreEndTag:
2011 return token
2012
2013 class InTableBodyPhase(Phase):
2014 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
2015 def __init__(self, parser, tree):
2016 Phase.__init__(self, parser, tree)
2017 self.startTagHandler = _utils.MethodDispatcher([
2018 ("html", self.startTagHtml),
2019 ("tr", self.startTagTr),
2020 (("td", "th"), self.startTagTableCell),
2021 (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
2022 self.startTagTableOther)
2023 ])
2024 self.startTagHandler.default = self.startTagOther
2025
2026 self.endTagHandler = _utils.MethodDispatcher([
2027 (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
2028 ("table", self.endTagTable),
2029 (("body", "caption", "col", "colgroup", "html", "td", "th",
2030 "tr"), self.endTagIgnore)
2031 ])
2032 self.endTagHandler.default = self.endTagOther
2033
2034 # helper methods
2035 def clearStackToTableBodyContext(self):
2036 while self.tree.openElements[-1].name not in ("tbody", "tfoot",
2037 "thead", "html"):
2038 # self.parser.parseError("unexpected-implied-end-tag-in-table",
2039 # {"name": self.tree.openElements[-1].name})
2040 self.tree.openElements.pop()
2041 if self.tree.openElements[-1].name == "html":
2042 assert self.parser.innerHTML
2043
2044 # the rest
2045 def processEOF(self):
2046 self.parser.phases["inTable"].processEOF()
2047
2048 def processSpaceCharacters(self, token):
2049 return self.parser.phases["inTable"].processSpaceCharacters(token)
2050
2051 def processCharacters(self, token):
2052 return self.parser.phases["inTable"].processCharacters(token)
2053
2054 def startTagTr(self, token):
2055 self.clearStackToTableBodyContext()
2056 self.tree.insertElement(token)
2057 self.parser.phase = self.parser.phases["inRow"]
2058
2059 def startTagTableCell(self, token):
2060 self.parser.parseError("unexpected-cell-in-table-body",
2061 {"name": token["name"]})
2062 self.startTagTr(impliedTagToken("tr", "StartTag"))
2063 return token
2064
2065 def startTagTableOther(self, token):
2066 # XXX AT Any ideas on how to share this with endTagTable?
2067 if (self.tree.elementInScope("tbody", variant="table") or
2068 self.tree.elementInScope("thead", variant="table") or
2069 self.tree.elementInScope("tfoot", variant="table")):
2070 self.clearStackToTableBodyContext()
2071 self.endTagTableRowGroup(
2072 impliedTagToken(self.tree.openElements[-1].name))
2073 return token
2074 else:
2075 # innerHTML case
2076 assert self.parser.innerHTML
2077 self.parser.parseError()
2078
2079 def startTagOther(self, token):
2080 return self.parser.phases["inTable"].processStartTag(token)
2081
2082 def endTagTableRowGroup(self, token):
2083 if self.tree.elementInScope(token["name"], variant="table"):
2084 self.clearStackToTableBodyContext()
2085 self.tree.openElements.pop()
2086 self.parser.phase = self.parser.phases["inTable"]
2087 else:
2088 self.parser.parseError("unexpected-end-tag-in-table-body",
2089 {"name": token["name"]})
2090
2091 def endTagTable(self, token):
2092 if (self.tree.elementInScope("tbody", variant="table") or
2093 self.tree.elementInScope("thead", variant="table") or
2094 self.tree.elementInScope("tfoot", variant="table")):
2095 self.clearStackToTableBodyContext()
2096 self.endTagTableRowGroup(
2097 impliedTagToken(self.tree.openElements[-1].name))
2098 return token
2099 else:
2100 # innerHTML case
2101 assert self.parser.innerHTML
2102 self.parser.parseError()
2103
2104 def endTagIgnore(self, token):
2105 self.parser.parseError("unexpected-end-tag-in-table-body",
2106 {"name": token["name"]})
2107
2108 def endTagOther(self, token):
2109 return self.parser.phases["inTable"].processEndTag(token)
2110
2111 class InRowPhase(Phase):
2112 # http://www.whatwg.org/specs/web-apps/current-work/#in-row
2113 def __init__(self, parser, tree):
2114 Phase.__init__(self, parser, tree)
2115 self.startTagHandler = _utils.MethodDispatcher([
2116 ("html", self.startTagHtml),
2117 (("td", "th"), self.startTagTableCell),
2118 (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
2119 "tr"), self.startTagTableOther)
2120 ])
2121 self.startTagHandler.default = self.startTagOther
2122
2123 self.endTagHandler = _utils.MethodDispatcher([
2124 ("tr", self.endTagTr),
2125 ("table", self.endTagTable),
2126 (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
2127 (("body", "caption", "col", "colgroup", "html", "td", "th"),
2128 self.endTagIgnore)
2129 ])
2130 self.endTagHandler.default = self.endTagOther
2131
2132 # helper methods (XXX unify this with other table helper methods)
2133 def clearStackToTableRowContext(self):
2134 while self.tree.openElements[-1].name not in ("tr", "html"):
2135 self.parser.parseError("unexpected-implied-end-tag-in-table-row",
2136 {"name": self.tree.openElements[-1].name})
2137 self.tree.openElements.pop()
2138
2139 def ignoreEndTagTr(self):
2140 return not self.tree.elementInScope("tr", variant="table")
2141
2142 # the rest
2143 def processEOF(self):
2144 self.parser.phases["inTable"].processEOF()
2145
2146 def processSpaceCharacters(self, token):
2147 return self.parser.phases["inTable"].processSpaceCharacters(token)
2148
2149 def processCharacters(self, token):
2150 return self.parser.phases["inTable"].processCharacters(token)
2151
2152 def startTagTableCell(self, token):
2153 self.clearStackToTableRowContext()
2154 self.tree.insertElement(token)
2155 self.parser.phase = self.parser.phases["inCell"]
2156 self.tree.activeFormattingElements.append(Marker)
2157
2158 def startTagTableOther(self, token):
2159 ignoreEndTag = self.ignoreEndTagTr()
2160 self.endTagTr(impliedTagToken("tr"))
2161 # XXX how are we sure it's always ignored in the innerHTML case?
2162 if not ignoreEndTag:
2163 return token
2164
2165 def startTagOther(self, token):
2166 return self.parser.phases["inTable"].processStartTag(token)
2167
2168 def endTagTr(self, token):
2169 if not self.ignoreEndTagTr():
2170 self.clearStackToTableRowContext()
2171 self.tree.openElements.pop()
2172 self.parser.phase = self.parser.phases["inTableBody"]
2173 else:
2174 # innerHTML case
2175 assert self.parser.innerHTML
2176 self.parser.parseError()
2177
2178 def endTagTable(self, token):
2179 ignoreEndTag = self.ignoreEndTagTr()
2180 self.endTagTr(impliedTagToken("tr"))
2181 # Reprocess the current tag if the tr end tag was not ignored
2182 # XXX how are we sure it's always ignored in the innerHTML case?
2183 if not ignoreEndTag:
2184 return token
2185
2186 def endTagTableRowGroup(self, token):
2187 if self.tree.elementInScope(token["name"], variant="table"):
2188 self.endTagTr(impliedTagToken("tr"))
2189 return token
2190 else:
2191 self.parser.parseError()
2192
2193 def endTagIgnore(self, token):
2194 self.parser.parseError("unexpected-end-tag-in-table-row",
2195 {"name": token["name"]})
2196
2197 def endTagOther(self, token):
2198 return self.parser.phases["inTable"].processEndTag(token)
2199
2200 class InCellPhase(Phase):
2201 # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
2202 def __init__(self, parser, tree):
2203 Phase.__init__(self, parser, tree)
2204 self.startTagHandler = _utils.MethodDispatcher([
2205 ("html", self.startTagHtml),
2206 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
2207 "thead", "tr"), self.startTagTableOther)
2208 ])
2209 self.startTagHandler.default = self.startTagOther
2210
2211 self.endTagHandler = _utils.MethodDispatcher([
2212 (("td", "th"), self.endTagTableCell),
2213 (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
2214 (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
2215 ])
2216 self.endTagHandler.default = self.endTagOther
2217
2218 # helper
2219 def closeCell(self):
2220 if self.tree.elementInScope("td", variant="table"):
2221 self.endTagTableCell(impliedTagToken("td"))
2222 elif self.tree.elementInScope("th", variant="table"):
2223 self.endTagTableCell(impliedTagToken("th"))
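# Example: in "<table><tr><td>a<td>b", the second <td> start tag is
# reprocessed by the "inRow" phase only after closeCell() has emitted an
# implied </td> for the first cell, so the cells become siblings in the
# same row.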
2224
2225 # the rest
2226 def processEOF(self):
2227 self.parser.phases["inBody"].processEOF()
2228
2229 def processCharacters(self, token):
2230 return self.parser.phases["inBody"].processCharacters(token)
2231
2232 def startTagTableOther(self, token):
2233 if (self.tree.elementInScope("td", variant="table") or
2234 self.tree.elementInScope("th", variant="table")):
2235 self.closeCell()
2236 return token
2237 else:
2238 # innerHTML case
2239 assert self.parser.innerHTML
2240 self.parser.parseError()
2241
2242 def startTagOther(self, token):
2243 return self.parser.phases["inBody"].processStartTag(token)
2244
2245 def endTagTableCell(self, token):
2246 if self.tree.elementInScope(token["name"], variant="table"):
2247 self.tree.generateImpliedEndTags(token["name"])
2248 if self.tree.openElements[-1].name != token["name"]:
2249 self.parser.parseError("unexpected-cell-end-tag",
2250 {"name": token["name"]})
2251 while True:
2252 node = self.tree.openElements.pop()
2253 if node.name == token["name"]:
2254 break
2255 else:
2256 self.tree.openElements.pop()
2257 self.tree.clearActiveFormattingElements()
2258 self.parser.phase = self.parser.phases["inRow"]
2259 else:
2260 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2261
2262 def endTagIgnore(self, token):
2263 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2264
2265 def endTagImply(self, token):
2266 if self.tree.elementInScope(token["name"], variant="table"):
2267 self.closeCell()
2268 return token
2269 else:
2270 # sometimes innerHTML case
2271 self.parser.parseError()
2272
2273 def endTagOther(self, token):
2274 return self.parser.phases["inBody"].processEndTag(token)
2275
2276 class InSelectPhase(Phase):
2277 def __init__(self, parser, tree):
2278 Phase.__init__(self, parser, tree)
2279
2280 self.startTagHandler = _utils.MethodDispatcher([
2281 ("html", self.startTagHtml),
2282 ("option", self.startTagOption),
2283 ("optgroup", self.startTagOptgroup),
2284 ("select", self.startTagSelect),
2285 (("input", "keygen", "textarea"), self.startTagInput),
2286 ("script", self.startTagScript)
2287 ])
2288 self.startTagHandler.default = self.startTagOther
2289
2290 self.endTagHandler = _utils.MethodDispatcher([
2291 ("option", self.endTagOption),
2292 ("optgroup", self.endTagOptgroup),
2293 ("select", self.endTagSelect)
2294 ])
2295 self.endTagHandler.default = self.endTagOther
2296
2297 # http://www.whatwg.org/specs/web-apps/current-work/#in-select
2298 def processEOF(self):
2299 if self.tree.openElements[-1].name != "html":
2300 self.parser.parseError("eof-in-select")
2301 else:
2302 assert self.parser.innerHTML
2303
2304 def processCharacters(self, token):
2305 if token["data"] == "\u0000":
2306 return
2307 self.tree.insertText(token["data"])
2308
2309 def startTagOption(self, token):
2310 # We need to imply </option> if <option> is the current node.
2311 if self.tree.openElements[-1].name == "option":
2312 self.tree.openElements.pop()
2313 self.tree.insertElement(token)
2314
2315 def startTagOptgroup(self, token):
2316 if self.tree.openElements[-1].name == "option":
2317 self.tree.openElements.pop()
2318 if self.tree.openElements[-1].name == "optgroup":
2319 self.tree.openElements.pop()
2320 self.tree.insertElement(token)
2321
2322 def startTagSelect(self, token):
2323 self.parser.parseError("unexpected-select-in-select")
2324 self.endTagSelect(impliedTagToken("select"))
2325
2326 def startTagInput(self, token):
2327 self.parser.parseError("unexpected-input-in-select")
2328 if self.tree.elementInScope("select", variant="select"):
2329 self.endTagSelect(impliedTagToken("select"))
2330 return token
2331 else:
2332 assert self.parser.innerHTML
2333
2334 def startTagScript(self, token):
2335 return self.parser.phases["inHead"].processStartTag(token)
2336
2337 def startTagOther(self, token):
2338 self.parser.parseError("unexpected-start-tag-in-select",
2339 {"name": token["name"]})
2340
2341 def endTagOption(self, token):
2342 if self.tree.openElements[-1].name == "option":
2343 self.tree.openElements.pop()
2344 else:
2345 self.parser.parseError("unexpected-end-tag-in-select",
2346 {"name": "option"})
2347
2348 def endTagOptgroup(self, token):
2349 # </optgroup> implicitly closes <option>
2350 if (self.tree.openElements[-1].name == "option" and
2351 self.tree.openElements[-2].name == "optgroup"):
2352 self.tree.openElements.pop()
2353 # It also closes </optgroup>
2354 if self.tree.openElements[-1].name == "optgroup":
2355 self.tree.openElements.pop()
2356 # But nothing else
2357 else:
2358 self.parser.parseError("unexpected-end-tag-in-select",
2359 {"name": "optgroup"})
2360
2361 def endTagSelect(self, token):
2362 if self.tree.elementInScope("select", variant="select"):
2363 node = self.tree.openElements.pop()
2364 while node.name != "select":
2365 node = self.tree.openElements.pop()
2366 self.parser.resetInsertionMode()
2367 else:
2368 # innerHTML case
2369 assert self.parser.innerHTML
2370 self.parser.parseError()
2371
2372 def endTagOther(self, token):
2373 self.parser.parseError("unexpected-end-tag-in-select",
2374 {"name": token["name"]})
2375
2376 class InSelectInTablePhase(Phase):
2377 def __init__(self, parser, tree):
2378 Phase.__init__(self, parser, tree)
2379
2380 self.startTagHandler = _utils.MethodDispatcher([
2381 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2382 self.startTagTable)
2383 ])
2384 self.startTagHandler.default = self.startTagOther
2385
2386 self.endTagHandler = _utils.MethodDispatcher([
2387 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2388 self.endTagTable)
2389 ])
2390 self.endTagHandler.default = self.endTagOther
2391
2392 def processEOF(self):
2393 self.parser.phases["inSelect"].processEOF()
2394
2395 def processCharacters(self, token):
2396 return self.parser.phases["inSelect"].processCharacters(token)
2397
2398 def startTagTable(self, token):
2399 self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
2400 self.endTagOther(impliedTagToken("select"))
2401 return token
2402
2403 def startTagOther(self, token):
2404 return self.parser.phases["inSelect"].processStartTag(token)
2405
2406 def endTagTable(self, token):
2407 self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
2408 if self.tree.elementInScope(token["name"], variant="table"):
2409 self.endTagOther(impliedTagToken("select"))
2410 return token
2411
2412 def endTagOther(self, token):
2413 return self.parser.phases["inSelect"].processEndTag(token)
2414
2415 class InForeignContentPhase(Phase):
2416 breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
2417 "center", "code", "dd", "div", "dl", "dt",
2418 "em", "embed", "h1", "h2", "h3",
2419 "h4", "h5", "h6", "head", "hr", "i", "img",
2420 "li", "listing", "menu", "meta", "nobr",
2421 "ol", "p", "pre", "ruby", "s", "small",
2422 "span", "strong", "strike", "sub", "sup",
2423 "table", "tt", "u", "ul", "var"])
2424
2425 def __init__(self, parser, tree):
2426 Phase.__init__(self, parser, tree)
2427
2428 def adjustSVGTagNames(self, token):
2429 replacements = {"altglyph": "altGlyph",
2430 "altglyphdef": "altGlyphDef",
2431 "altglyphitem": "altGlyphItem",
2432 "animatecolor": "animateColor",
2433 "animatemotion": "animateMotion",
2434 "animatetransform": "animateTransform",
2435 "clippath": "clipPath",
2436 "feblend": "feBlend",
2437 "fecolormatrix": "feColorMatrix",
2438 "fecomponenttransfer": "feComponentTransfer",
2439 "fecomposite": "feComposite",
2440 "feconvolvematrix": "feConvolveMatrix",
2441 "fediffuselighting": "feDiffuseLighting",
2442 "fedisplacementmap": "feDisplacementMap",
2443 "fedistantlight": "feDistantLight",
2444 "feflood": "feFlood",
2445 "fefunca": "feFuncA",
2446 "fefuncb": "feFuncB",
2447 "fefuncg": "feFuncG",
2448 "fefuncr": "feFuncR",
2449 "fegaussianblur": "feGaussianBlur",
2450 "feimage": "feImage",
2451 "femerge": "feMerge",
2452 "femergenode": "feMergeNode",
2453 "femorphology": "feMorphology",
2454 "feoffset": "feOffset",
2455 "fepointlight": "fePointLight",
2456 "fespecularlighting": "feSpecularLighting",
2457 "fespotlight": "feSpotLight",
2458 "fetile": "feTile",
2459 "feturbulence": "feTurbulence",
2460 "foreignobject": "foreignObject",
2461 "glyphref": "glyphRef",
2462 "lineargradient": "linearGradient",
2463 "radialgradient": "radialGradient",
2464 "textpath": "textPath"}
2465
2466 if token["name"] in replacements:
2467 token["name"] = replacements[token["name"]]
2468
2469 def processCharacters(self, token):
2470 if token["data"] == "\u0000":
2471 token["data"] = "\uFFFD"
2472 elif (self.parser.framesetOK and
2473 any(char not in spaceCharacters for char in token["data"])):
2474 self.parser.framesetOK = False
2475 Phase.processCharacters(self, token)
2476
2477 def processStartTag(self, token):
2478 currentNode = self.tree.openElements[-1]
2479 if (token["name"] in self.breakoutElements or
2480 (token["name"] == "font" and
2481 set(token["data"].keys()) & set(["color", "face", "size"]))):
2482 self.parser.parseError("unexpected-html-element-in-foreign-content",
2483 {"name": token["name"]})
2484 while (self.tree.openElements[-1].namespace !=
2485 self.tree.defaultNamespace and
2486 not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2487 not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
2488 self.tree.openElements.pop()
2489 return token
2490
2491 else:
2492 if currentNode.namespace == namespaces["mathml"]:
2493 self.parser.adjustMathMLAttributes(token)
2494 elif currentNode.namespace == namespaces["svg"]:
2495 self.adjustSVGTagNames(token)
2496 self.parser.adjustSVGAttributes(token)
2497 self.parser.adjustForeignAttributes(token)
2498 token["namespace"] = currentNode.namespace
2499 self.tree.insertElement(token)
2500 if token["selfClosing"]:
2501 self.tree.openElements.pop()
2502 token["selfClosingAcknowledged"] = True
2503
2504 def processEndTag(self, token):
2505 nodeIndex = len(self.tree.openElements) - 1
2506 node = self.tree.openElements[-1]
2507 if node.name.translate(asciiUpper2Lower) != token["name"]:
2508 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2509
2510 while True:
2511 if node.name.translate(asciiUpper2Lower) == token["name"]:
2512 # XXX this isn't in the spec but it seems necessary
2513 if self.parser.phase == self.parser.phases["inTableText"]:
2514 self.parser.phase.flushCharacters()
2515 self.parser.phase = self.parser.phase.originalPhase
2516 while self.tree.openElements.pop() != node:
2517 assert self.tree.openElements
2518 new_token = None
2519 break
2520 nodeIndex -= 1
2521
2522 node = self.tree.openElements[nodeIndex]
2523 if node.namespace != self.tree.defaultNamespace:
2524 continue
2525 else:
2526 new_token = self.parser.phase.processEndTag(token)
2527 break
2528 return new_token
2529
2530 class AfterBodyPhase(Phase):
2531 def __init__(self, parser, tree):
2532 Phase.__init__(self, parser, tree)
2533
2534 self.startTagHandler = _utils.MethodDispatcher([
2535 ("html", self.startTagHtml)
2536 ])
2537 self.startTagHandler.default = self.startTagOther
2538
2539 self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
2540 self.endTagHandler.default = self.endTagOther
2541
2542 def processEOF(self):
2543 # Stop parsing
2544 pass
2545
2546 def processComment(self, token):
2547 # This is needed because data is to be appended to the <html> element
2548 # here and not to whatever is currently open.
2549 self.tree.insertComment(token, self.tree.openElements[0])
2550
2551 def processCharacters(self, token):
2552 self.parser.parseError("unexpected-char-after-body")
2553 self.parser.phase = self.parser.phases["inBody"]
2554 return token
2555
2556 def startTagHtml(self, token):
2557 return self.parser.phases["inBody"].processStartTag(token)
2558
2559 def startTagOther(self, token):
2560 self.parser.parseError("unexpected-start-tag-after-body",
2561 {"name": token["name"]})
2562 self.parser.phase = self.parser.phases["inBody"]
2563 return token
2564
2565 def endTagHtml(self, name):
2566 if self.parser.innerHTML:
2567 self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
2568 else:
2569 self.parser.phase = self.parser.phases["afterAfterBody"]
2570
2571 def endTagOther(self, token):
2572 self.parser.parseError("unexpected-end-tag-after-body",
2573 {"name": token["name"]})
2574 self.parser.phase = self.parser.phases["inBody"]
2575 return token
2576
2577 class InFramesetPhase(Phase):
2578 # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
2579 def __init__(self, parser, tree):
2580 Phase.__init__(self, parser, tree)
2581
2582 self.startTagHandler = _utils.MethodDispatcher([
2583 ("html", self.startTagHtml),
2584 ("frameset", self.startTagFrameset),
2585 ("frame", self.startTagFrame),
2586 ("noframes", self.startTagNoframes)
2587 ])
2588 self.startTagHandler.default = self.startTagOther
2589
2590 self.endTagHandler = _utils.MethodDispatcher([
2591 ("frameset", self.endTagFrameset)
2592 ])
2593 self.endTagHandler.default = self.endTagOther
2594
2595 def processEOF(self):
2596 if self.tree.openElements[-1].name != "html":
2597 self.parser.parseError("eof-in-frameset")
2598 else:
2599 assert self.parser.innerHTML
2600
2601 def processCharacters(self, token):
2602 self.parser.parseError("unexpected-char-in-frameset")
2603
2604 def startTagFrameset(self, token):
2605 self.tree.insertElement(token)
2606
2607 def startTagFrame(self, token):
2608 self.tree.insertElement(token)
2609 self.tree.openElements.pop()
2610
2611 def startTagNoframes(self, token):
2612 return self.parser.phases["inBody"].processStartTag(token)
2613
2614 def startTagOther(self, token):
2615 self.parser.parseError("unexpected-start-tag-in-frameset",
2616 {"name": token["name"]})
2617
2618 def endTagFrameset(self, token):
2619 if self.tree.openElements[-1].name == "html":
2620 # innerHTML case
2621 self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
2622 else:
2623 self.tree.openElements.pop()
2624 if (not self.parser.innerHTML and
2625 self.tree.openElements[-1].name != "frameset"):
2626 # If we're not in innerHTML mode and the current node is not a
2627 # "frameset" element (anymore) then switch.
2628 self.parser.phase = self.parser.phases["afterFrameset"]
2629
2630 def endTagOther(self, token):
2631 self.parser.parseError("unexpected-end-tag-in-frameset",
2632 {"name": token["name"]})
2633
2634 class AfterFramesetPhase(Phase):
2635 # http://www.whatwg.org/specs/web-apps/current-work/#after3
2636 def __init__(self, parser, tree):
2637 Phase.__init__(self, parser, tree)
2638
2639 self.startTagHandler = _utils.MethodDispatcher([
2640 ("html", self.startTagHtml),
2641 ("noframes", self.startTagNoframes)
2642 ])
2643 self.startTagHandler.default = self.startTagOther
2644
2645 self.endTagHandler = _utils.MethodDispatcher([
2646 ("html", self.endTagHtml)
2647 ])
2648 self.endTagHandler.default = self.endTagOther
2649
2650 def processEOF(self):
2651 # Stop parsing
2652 pass
2653
2654 def processCharacters(self, token):
2655 self.parser.parseError("unexpected-char-after-frameset")
2656
2657 def startTagNoframes(self, token):
2658 return self.parser.phases["inHead"].processStartTag(token)
2659
2660 def startTagOther(self, token):
2661 self.parser.parseError("unexpected-start-tag-after-frameset",
2662 {"name": token["name"]})
2663
2664 def endTagHtml(self, token):
2665 self.parser.phase = self.parser.phases["afterAfterFrameset"]
2666
2667 def endTagOther(self, token):
2668 self.parser.parseError("unexpected-end-tag-after-frameset",
2669 {"name": token["name"]})
2670
2671 class AfterAfterBodyPhase(Phase):
2672 def __init__(self, parser, tree):
2673 Phase.__init__(self, parser, tree)
2674
2675 self.startTagHandler = _utils.MethodDispatcher([
2676 ("html", self.startTagHtml)
2677 ])
2678 self.startTagHandler.default = self.startTagOther
2679
2680 def processEOF(self):
2681 pass
2682
2683 def processComment(self, token):
2684 self.tree.insertComment(token, self.tree.document)
2685
2686 def processSpaceCharacters(self, token):
2687 return self.parser.phases["inBody"].processSpaceCharacters(token)
2688
2689 def processCharacters(self, token):
2690 self.parser.parseError("expected-eof-but-got-char")
2691 self.parser.phase = self.parser.phases["inBody"]
2692 return token
2693
2694 def startTagHtml(self, token):
2695 return self.parser.phases["inBody"].processStartTag(token)
2696
2697 def startTagOther(self, token):
2698 self.parser.parseError("expected-eof-but-got-start-tag",
2699 {"name": token["name"]})
2700 self.parser.phase = self.parser.phases["inBody"]
2701 return token
2702
2703 def processEndTag(self, token):
2704 self.parser.parseError("expected-eof-but-got-end-tag",
2705 {"name": token["name"]})
2706 self.parser.phase = self.parser.phases["inBody"]
2707 return token
2708
2709 class AfterAfterFramesetPhase(Phase):
2710 def __init__(self, parser, tree):
2711 Phase.__init__(self, parser, tree)
2712
2713 self.startTagHandler = _utils.MethodDispatcher([
2714 ("html", self.startTagHtml),
2715 ("noframes", self.startTagNoFrames)
2716 ])
2717 self.startTagHandler.default = self.startTagOther
2718
2719 def processEOF(self):
2720 pass
2721
2722 def processComment(self, token):
2723 self.tree.insertComment(token, self.tree.document)
2724
2725 def processSpaceCharacters(self, token):
2726 return self.parser.phases["inBody"].processSpaceCharacters(token)
2727
2728 def processCharacters(self, token):
2729 self.parser.parseError("expected-eof-but-got-char")
2730
2731 def startTagHtml(self, token):
2732 return self.parser.phases["inBody"].processStartTag(token)
2733
2734 def startTagNoFrames(self, token):
2735 return self.parser.phases["inHead"].processStartTag(token)
2736
2737 def startTagOther(self, token):
2738 self.parser.parseError("expected-eof-but-got-start-tag",
2739 {"name": token["name"]})
2740
2741 def processEndTag(self, token):
2742 self.parser.parseError("expected-eof-but-got-end-tag",
2743 {"name": token["name"]})
2744 # pylint:enable=unused-argument
2745
2746 return {
2747 "initial": InitialPhase,
2748 "beforeHtml": BeforeHtmlPhase,
2749 "beforeHead": BeforeHeadPhase,
2750 "inHead": InHeadPhase,
2751 "inHeadNoscript": InHeadNoscriptPhase,
2752 "afterHead": AfterHeadPhase,
2753 "inBody": InBodyPhase,
2754 "text": TextPhase,
2755 "inTable": InTablePhase,
2756 "inTableText": InTableTextPhase,
2757 "inCaption": InCaptionPhase,
2758 "inColumnGroup": InColumnGroupPhase,
2759 "inTableBody": InTableBodyPhase,
2760 "inRow": InRowPhase,
2761 "inCell": InCellPhase,
2762 "inSelect": InSelectPhase,
2763 "inSelectInTable": InSelectInTablePhase,
2764 "inForeignContent": InForeignContentPhase,
2765 "afterBody": AfterBodyPhase,
2766 "inFrameset": InFramesetPhase,
2767 "afterFrameset": AfterFramesetPhase,
2768 "afterAfterBody": AfterAfterBodyPhase,
2769 "afterAfterFrameset": AfterAfterFramesetPhase,
2770 # XXX after after frameset
2771 }
2772
2773
2774def adjust_attributes(token, replacements):
2775 needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
2776 if needs_adjustment:
2777 token['data'] = OrderedDict((replacements.get(k, k), v)
2778 for k, v in token['data'].items())
2779
2780
2781def impliedTagToken(name, type="EndTag", attributes=None,
2782 selfClosing=False):
2783 if attributes is None:
2784 attributes = {}
2785 return {"type": tokenTypes[type], "name": name, "data": attributes,
2786 "selfClosing": selfClosing}
2787
2788
2789class ParseError(Exception):
2790 """Error in parsed document"""
2791 pass
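
A minimal usage sketch of the helper defined above (not a definitive example; it assumes the vendored import path under pip._vendor): impliedTagToken builds the plain token dicts that the phase classes feed back through their processStartTag/processEndTag handlers.

    from pip._vendor.html5lib.html5parser import impliedTagToken
    from pip._vendor.html5lib.constants import tokenTypes

    # An implied end tag defaults to type "EndTag" with no attributes.
    end_p = impliedTagToken("p")
    assert end_p["type"] == tokenTypes["EndTag"] and end_p["name"] == "p"

    # A synthetic start tag, e.g. the <body> the parser inserts on its own.
    start_body = impliedTagToken("body", "StartTag")
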
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py
new file mode 100644
index 0000000..641323e
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/serializer.py
@@ -0,0 +1,409 @@
1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import text_type
3
4import re
5
6from codecs import register_error, xmlcharrefreplace_errors
7
8from .constants import voidElements, booleanAttributes, spaceCharacters
9from .constants import rcdataElements, entities, xmlEntities
10from . import treewalkers, _utils
11from xml.sax.saxutils import escape
12
13_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
14_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
15_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
16 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
17 "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
18 "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
19 "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
20 "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
21 "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
22 "\u3000]")
23
24
25_encode_entity_map = {}
26_is_ucs4 = len("\U0010FFFF") == 1
27for k, v in list(entities.items()):
28 # skip multi-character entities
29 if ((_is_ucs4 and len(v) > 1) or
30 (not _is_ucs4 and len(v) > 2)):
31 continue
32 if v != "&":
33 if len(v) == 2:
34 v = _utils.surrogatePairToCodepoint(v)
35 else:
36 v = ord(v)
37 if v not in _encode_entity_map or k.islower():
38 # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
39 _encode_entity_map[v] = k
40
41
42def htmlentityreplace_errors(exc):
43 if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
44 res = []
45 codepoints = []
46 skip = False
47 for i, c in enumerate(exc.object[exc.start:exc.end]):
48 if skip:
49 skip = False
50 continue
51 index = i + exc.start
52 if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
53 codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
54 skip = True
55 else:
56 codepoint = ord(c)
57 codepoints.append(codepoint)
58 for cp in codepoints:
59 e = _encode_entity_map.get(cp)
60 if e:
61 res.append("&")
62 res.append(e)
63 if not e.endswith(";"):
64 res.append(";")
65 else:
66 res.append("&#x%s;" % (hex(cp)[2:]))
67 return ("".join(res), exc.end)
68 else:
69 return xmlcharrefreplace_errors(exc)
70
71
72register_error("htmlentityreplace", htmlentityreplace_errors)
73
74
75def serialize(input, tree="etree", encoding=None, **serializer_opts):
76 """Serializes the input token stream using the specified treewalker
77
78 :arg input: the token stream to serialize
79
80 :arg tree: the treewalker to use
81
82 :arg encoding: the encoding to use
83
84 :arg serializer_opts: any options to pass to the
85 :py:class:`html5lib.serializer.HTMLSerializer` that gets created
86
87 :returns: the tree serialized as a string
88
89 Example:
90
91 >>> from html5lib.html5parser import parse
92 >>> from html5lib.serializer import serialize
93 >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
94 >>> serialize(token_stream, omit_optional_tags=False)
95 '<html><head></head><body><p>Hi!</p></body></html>'
96
97 """
98 # XXX: Should we cache this?
99 walker = treewalkers.getTreeWalker(tree)
100 s = HTMLSerializer(**serializer_opts)
101 return s.render(walker(input), encoding)
102
103
104class HTMLSerializer(object):
105
106 # attribute quoting options
107 quote_attr_values = "legacy" # be secure by default
108 quote_char = '"'
109 use_best_quote_char = True
110
111 # tag syntax options
112 omit_optional_tags = True
113 minimize_boolean_attributes = True
114 use_trailing_solidus = False
115 space_before_trailing_solidus = True
116
117 # escaping options
118 escape_lt_in_attrs = False
119 escape_rcdata = False
120 resolve_entities = True
121
122 # miscellaneous options
123 alphabetical_attributes = False
124 inject_meta_charset = True
125 strip_whitespace = False
126 sanitize = False
127
128 options = ("quote_attr_values", "quote_char", "use_best_quote_char",
129 "omit_optional_tags", "minimize_boolean_attributes",
130 "use_trailing_solidus", "space_before_trailing_solidus",
131 "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
132 "alphabetical_attributes", "inject_meta_charset",
133 "strip_whitespace", "sanitize")
134
135 def __init__(self, **kwargs):
136 """Initialize HTMLSerializer
137
138 :arg inject_meta_charset: Whether or not to inject the meta charset.
139
140 Defaults to ``True``.
141
142 :arg quote_attr_values: Whether to quote attribute values that don't
143 require quoting per legacy browser behavior (``"legacy"``), when
144 required by the standard (``"spec"``), or always (``"always"``).
145
146 Defaults to ``"legacy"``.
147
148 :arg quote_char: Use given quote character for attribute quoting.
149
150 Defaults to ``"`` which will use double quotes unless attribute
151 value contains a double quote, in which case single quotes are
152 used.
153
154 :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
155 values.
156
157 Defaults to ``False``.
158
159 :arg escape_rcdata: Whether to escape characters that need to be
160 escaped within normal elements within rcdata elements such as
161 style.
162
163 Defaults to ``False``.
164
165 :arg resolve_entities: Whether to resolve named character entities that
166 appear in the source tree. The XML predefined entities &lt; &gt;
167 &amp; &quot; &apos; are unaffected by this setting.
168
169 Defaults to ``True``.
170
171 :arg strip_whitespace: Whether to remove semantically meaningless
172 whitespace. (This compresses all whitespace to a single space
173 except within ``pre``.)
174
175 Defaults to ``False``.
176
177 :arg minimize_boolean_attributes: Shortens boolean attributes to give
178 just the attribute value, for example::
179
180 <input disabled="disabled">
181
182 becomes::
183
184 <input disabled>
185
186 Defaults to ``True``.
187
188 :arg use_trailing_solidus: Includes a close-tag slash at the end of the
189 start tag of void elements (empty elements whose end tag is
190 forbidden). E.g. ``<hr/>``.
191
192 Defaults to ``False``.
193
194 :arg space_before_trailing_solidus: Places a space immediately before
195 the closing slash in a tag using a trailing solidus. E.g.
196 ``<hr />``. Requires ``use_trailing_solidus=True``.
197
198 Defaults to ``True``.
199
200 :arg sanitize: Strip all unsafe or unknown constructs from output.
201 See :py:class:`html5lib.filters.sanitizer.Filter`.
202
203 Defaults to ``False``.
204
205 :arg omit_optional_tags: Omit start/end tags that are optional.
206
207 Defaults to ``True``.
208
209 :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
210
211 Defaults to ``False``.
212
213 """
214 unexpected_args = frozenset(kwargs) - frozenset(self.options)
215 if len(unexpected_args) > 0:
216 raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
217 if 'quote_char' in kwargs:
218 self.use_best_quote_char = False
219 for attr in self.options:
220 setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
221 self.errors = []
222 self.strict = False
223
224 def encode(self, string):
225 assert(isinstance(string, text_type))
226 if self.encoding:
227 return string.encode(self.encoding, "htmlentityreplace")
228 else:
229 return string
230
231 def encodeStrict(self, string):
232 assert(isinstance(string, text_type))
233 if self.encoding:
234 return string.encode(self.encoding, "strict")
235 else:
236 return string
237
238 def serialize(self, treewalker, encoding=None):
239 # pylint:disable=too-many-nested-blocks
240 self.encoding = encoding
241 in_cdata = False
242 self.errors = []
243
244 if encoding and self.inject_meta_charset:
245 from .filters.inject_meta_charset import Filter
246 treewalker = Filter(treewalker, encoding)
247        # The alphabetical-attributes filter runs here on the assumption that
248        # none of the later filters adds attributes or changes their order; it
249        # needs to run before the sanitizer so escaped elements come out correctly
250 if self.alphabetical_attributes:
251 from .filters.alphabeticalattributes import Filter
252 treewalker = Filter(treewalker)
253 # WhitespaceFilter should be used before OptionalTagFilter
254        # for maximum efficiency of the latter filter
255 if self.strip_whitespace:
256 from .filters.whitespace import Filter
257 treewalker = Filter(treewalker)
258 if self.sanitize:
259 from .filters.sanitizer import Filter
260 treewalker = Filter(treewalker)
261 if self.omit_optional_tags:
262 from .filters.optionaltags import Filter
263 treewalker = Filter(treewalker)
264
265 for token in treewalker:
266 type = token["type"]
267 if type == "Doctype":
268 doctype = "<!DOCTYPE %s" % token["name"]
269
270 if token["publicId"]:
271 doctype += ' PUBLIC "%s"' % token["publicId"]
272 elif token["systemId"]:
273 doctype += " SYSTEM"
274 if token["systemId"]:
275 if token["systemId"].find('"') >= 0:
276 if token["systemId"].find("'") >= 0:
277                            self.serializeError("System identifier contains both single and double quote characters")
278 quote_char = "'"
279 else:
280 quote_char = '"'
281 doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
282
283 doctype += ">"
284 yield self.encodeStrict(doctype)
285
286 elif type in ("Characters", "SpaceCharacters"):
287 if type == "SpaceCharacters" or in_cdata:
288 if in_cdata and token["data"].find("</") >= 0:
289 self.serializeError("Unexpected </ in CDATA")
290 yield self.encode(token["data"])
291 else:
292 yield self.encode(escape(token["data"]))
293
294 elif type in ("StartTag", "EmptyTag"):
295 name = token["name"]
296 yield self.encodeStrict("<%s" % name)
297 if name in rcdataElements and not self.escape_rcdata:
298 in_cdata = True
299 elif in_cdata:
300 self.serializeError("Unexpected child element of a CDATA element")
301 for (_, attr_name), attr_value in token["data"].items():
302 # TODO: Add namespace support here
303 k = attr_name
304 v = attr_value
305 yield self.encodeStrict(' ')
306
307 yield self.encodeStrict(k)
308 if not self.minimize_boolean_attributes or \
309 (k not in booleanAttributes.get(name, tuple()) and
310 k not in booleanAttributes.get("", tuple())):
311 yield self.encodeStrict("=")
312 if self.quote_attr_values == "always" or len(v) == 0:
313 quote_attr = True
314 elif self.quote_attr_values == "spec":
315 quote_attr = _quoteAttributeSpec.search(v) is not None
316 elif self.quote_attr_values == "legacy":
317 quote_attr = _quoteAttributeLegacy.search(v) is not None
318 else:
319 raise ValueError("quote_attr_values must be one of: "
320 "'always', 'spec', or 'legacy'")
321 v = v.replace("&", "&amp;")
322 if self.escape_lt_in_attrs:
323 v = v.replace("<", "&lt;")
324 if quote_attr:
325 quote_char = self.quote_char
326 if self.use_best_quote_char:
327 if "'" in v and '"' not in v:
328 quote_char = '"'
329 elif '"' in v and "'" not in v:
330 quote_char = "'"
331 if quote_char == "'":
332 v = v.replace("'", "&#39;")
333 else:
334 v = v.replace('"', "&quot;")
335 yield self.encodeStrict(quote_char)
336 yield self.encode(v)
337 yield self.encodeStrict(quote_char)
338 else:
339 yield self.encode(v)
340 if name in voidElements and self.use_trailing_solidus:
341 if self.space_before_trailing_solidus:
342 yield self.encodeStrict(" /")
343 else:
344 yield self.encodeStrict("/")
345 yield self.encode(">")
346
347 elif type == "EndTag":
348 name = token["name"]
349 if name in rcdataElements:
350 in_cdata = False
351 elif in_cdata:
352 self.serializeError("Unexpected child element of a CDATA element")
353 yield self.encodeStrict("</%s>" % name)
354
355 elif type == "Comment":
356 data = token["data"]
357 if data.find("--") >= 0:
358 self.serializeError("Comment contains --")
359 yield self.encodeStrict("<!--%s-->" % token["data"])
360
361 elif type == "Entity":
362 name = token["name"]
363 key = name + ";"
364 if key not in entities:
365 self.serializeError("Entity %s not recognized" % name)
366 if self.resolve_entities and key not in xmlEntities:
367 data = entities[key]
368 else:
369 data = "&%s;" % name
370 yield self.encodeStrict(data)
371
372 else:
373 self.serializeError(token["data"])
374
375 def render(self, treewalker, encoding=None):
376 """Serializes the stream from the treewalker into a string
377
378 :arg treewalker: the treewalker to serialize
379
380 :arg encoding: the string encoding to use
381
382 :returns: the serialized tree
383
384 Example:
385
386 >>> from html5lib import parse, getTreeWalker
387 >>> from html5lib.serializer import HTMLSerializer
388 >>> token_stream = parse('<html><body>Hi!</body></html>')
389 >>> walker = getTreeWalker('etree')
390 >>> serializer = HTMLSerializer(omit_optional_tags=False)
391 >>> serializer.render(walker(token_stream))
392 '<html><head></head><body>Hi!</body></html>'
393
394 """
395 if encoding:
396 return b"".join(list(self.serialize(treewalker, encoding)))
397 else:
398 return "".join(list(self.serialize(treewalker)))
399
400 def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
401 # XXX The idea is to make data mandatory.
402 self.errors.append(data)
403 if self.strict:
404 raise SerializeError
405
406
407class SerializeError(Exception):
408 """Error in serialized tree"""
409 pass
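
A short sketch of how HTMLSerializer is typically driven (assuming the vendored pip._vendor import path): parse to a tree, wrap it in a treewalker, and render either to text or, with an encoding, to bytes via the htmlentityreplace error handler registered above.

    from pip._vendor import html5lib
    from pip._vendor.html5lib.serializer import HTMLSerializer

    tree = html5lib.parse('<p class=greeting>Hi!</p>')       # etree-backed tree
    walker = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(omit_optional_tags=False,
                                quote_attr_values='always')

    text_out = serializer.render(walker(tree))                # unicode string
    byte_out = serializer.render(walker(tree), 'ascii')       # bytes; entities used where needed
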
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/__init__.py
new file mode 100644
index 0000000..8767fb0
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/__init__.py
@@ -0,0 +1,30 @@
1"""Tree adapters let you convert from one tree structure to another
2
3Example:
4
5.. code-block:: python
6
7 from pip._vendor import html5lib
8 from pip._vendor.html5lib.treeadapters import genshi
9
10 doc = '<html><body>Hi!</body></html>'
11 treebuilder = html5lib.getTreeBuilder('etree')
12 parser = html5lib.HTMLParser(tree=treebuilder)
13 tree = parser.parse(doc)
14 TreeWalker = html5lib.getTreeWalker('etree')
15
16 genshi_tree = genshi.to_genshi(TreeWalker(tree))
17
18"""
19from __future__ import absolute_import, division, unicode_literals
20
21from . import sax
22
23__all__ = ["sax"]
24
25try:
26 from . import genshi # noqa
27except ImportError:
28 pass
29else:
30 __all__.append("genshi")
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/genshi.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/genshi.py
new file mode 100644
index 0000000..73c70c6
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/genshi.py
@@ -0,0 +1,54 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from genshi.core import QName, Attrs
4from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
5
6
7def to_genshi(walker):
8 """Convert a tree to a genshi tree
9
10 :arg walker: the treewalker to use to walk the tree to convert it
11
12 :returns: generator of genshi nodes
13
14 """
15 text = []
16 for token in walker:
17 type = token["type"]
18 if type in ("Characters", "SpaceCharacters"):
19 text.append(token["data"])
20 elif text:
21 yield TEXT, "".join(text), (None, -1, -1)
22 text = []
23
24 if type in ("StartTag", "EmptyTag"):
25 if token["namespace"]:
26 name = "{%s}%s" % (token["namespace"], token["name"])
27 else:
28 name = token["name"]
29 attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
30 for attr, value in token["data"].items()])
31 yield (START, (QName(name), attrs), (None, -1, -1))
32 if type == "EmptyTag":
33 type = "EndTag"
34
35 if type == "EndTag":
36 if token["namespace"]:
37 name = "{%s}%s" % (token["namespace"], token["name"])
38 else:
39 name = token["name"]
40
41 yield END, QName(name), (None, -1, -1)
42
43 elif type == "Comment":
44 yield COMMENT, token["data"], (None, -1, -1)
45
46 elif type == "Doctype":
47 yield DOCTYPE, (token["name"], token["publicId"],
48 token["systemId"]), (None, -1, -1)
49
50 else:
51 pass # FIXME: What to do?
52
53 if text:
54 yield TEXT, "".join(text), (None, -1, -1)
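
A small sketch of consuming to_genshi's output (it requires the optional third-party genshi package, since the adapter imports genshi.core): the function yields (kind, data, pos) event tuples that can be iterated directly or fed on to genshi's own machinery.

    from pip._vendor import html5lib
    from pip._vendor.html5lib.treeadapters.genshi import to_genshi

    tree = html5lib.parse('<p>Hi <b>there</b></p>')
    walker = html5lib.getTreeWalker('etree')
    for kind, data, pos in to_genshi(walker(tree)):
        print(kind, data)   # e.g. START/(QName, Attrs), TEXT/'Hi ', END/QName
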
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/sax.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/sax.py
new file mode 100644
index 0000000..1f06d13
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treeadapters/sax.py
@@ -0,0 +1,50 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from xml.sax.xmlreader import AttributesNSImpl
4
5from ..constants import adjustForeignAttributes, unadjustForeignAttributes
6
7prefix_mapping = {}
8for prefix, localName, namespace in adjustForeignAttributes.values():
9 if prefix is not None:
10 prefix_mapping[prefix] = namespace
11
12
13def to_sax(walker, handler):
14 """Call SAX-like content handler based on treewalker walker
15
16 :arg walker: the treewalker to use to walk the tree to convert it
17
18 :arg handler: SAX handler to use
19
20 """
21 handler.startDocument()
22 for prefix, namespace in prefix_mapping.items():
23 handler.startPrefixMapping(prefix, namespace)
24
25 for token in walker:
26 type = token["type"]
27 if type == "Doctype":
28 continue
29 elif type in ("StartTag", "EmptyTag"):
30 attrs = AttributesNSImpl(token["data"],
31 unadjustForeignAttributes)
32 handler.startElementNS((token["namespace"], token["name"]),
33 token["name"],
34 attrs)
35 if type == "EmptyTag":
36 handler.endElementNS((token["namespace"], token["name"]),
37 token["name"])
38 elif type == "EndTag":
39 handler.endElementNS((token["namespace"], token["name"]),
40 token["name"])
41 elif type in ("Characters", "SpaceCharacters"):
42 handler.characters(token["data"])
43 elif type == "Comment":
44 pass
45 else:
46 assert False, "Unknown token type"
47
48 for prefix, namespace in prefix_mapping.items():
49 handler.endPrefixMapping(prefix)
50 handler.endDocument()
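
A sketch of driving to_sax with a standard ContentHandler (the TextCollector class is illustrative, not part of html5lib): any xml.sax handler works, since the adapter only calls the usual startDocument/startElementNS/characters/endElementNS/endDocument methods.

    from xml.sax.handler import ContentHandler
    from pip._vendor import html5lib
    from pip._vendor.html5lib.treeadapters.sax import to_sax

    class TextCollector(ContentHandler):
        """Illustrative handler that just gathers character data."""
        def __init__(self):
            ContentHandler.__init__(self)
            self.chunks = []

        def characters(self, content):
            self.chunks.append(content)

    tree = html5lib.parse('<p>Hello <b>SAX</b></p>')
    walker = html5lib.getTreeWalker('etree')
    collector = TextCollector()
    to_sax(walker(tree), collector)
    print(''.join(collector.chunks))   # -> 'Hello SAX'
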
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/__init__.py
new file mode 100644
index 0000000..2ce5c87
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/__init__.py
@@ -0,0 +1,88 @@
1"""A collection of modules for building different kinds of trees from HTML
2documents.
3
4To create a treebuilder for a new type of tree, you need to
5implement several things:
6
71. A set of classes for various types of elements: Document, Doctype, Comment,
8   Element. These must implement the interface of ``treebuilders.base.Node``
9   (although comment nodes have a different signature for their constructor,
10   see ``treebuilders.etree.Comment``). Textual content may also be implemented
11 as another node type, or not, as your tree implementation requires.
12
132. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
14 from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
15
16 * ``documentClass`` - the class to use for the bottommost node of a document
17 * ``elementClass`` - the class to use for HTML Elements
18 * ``commentClass`` - the class to use for comments
19 * ``doctypeClass`` - the class to use for doctypes
20
21 It also has one required method:
22
23 * ``getDocument`` - Returns the root node of the complete document tree
24
253. If you wish to run the unit tests, you must also create a ``testSerializer``
26 method on your treebuilder which accepts a node and returns a string
27   containing the node and its children serialized according to the format used
28   in the unit tests.
29
30"""
31
32from __future__ import absolute_import, division, unicode_literals
33
34from .._utils import default_etree
35
36treeBuilderCache = {}
37
38
39def getTreeBuilder(treeType, implementation=None, **kwargs):
40 """Get a TreeBuilder class for various types of trees with built-in support
41
42 :arg treeType: the name of the tree type required (case-insensitive). Supported
43 values are:
44
45        * "dom" - A generic builder for DOM implementations, defaulting to an
46          xml.dom.minidom-based implementation.
47 * "etree" - A generic builder for tree implementations exposing an
48 ElementTree-like interface, defaulting to xml.etree.cElementTree if
49 available and xml.etree.ElementTree if not.
50        * "lxml" - An etree-based builder for lxml.etree, handling limitations
51 of lxml's implementation.
52
53 :arg implementation: (Currently applies to the "etree" and "dom" tree
54 types). A module implementing the tree type e.g. xml.etree.ElementTree
55 or xml.etree.cElementTree.
56
57 :arg kwargs: Any additional options to pass to the TreeBuilder when
58 creating it.
59
60 Example:
61
62 >>> from html5lib.treebuilders import getTreeBuilder
63 >>> builder = getTreeBuilder('etree')
64
65 """
66
67 treeType = treeType.lower()
68 if treeType not in treeBuilderCache:
69 if treeType == "dom":
70 from . import dom
71 # Come up with a sane default (pref. from the stdlib)
72 if implementation is None:
73 from xml.dom import minidom
74 implementation = minidom
75 # NEVER cache here, caching is done in the dom submodule
76 return dom.getDomModule(implementation, **kwargs).TreeBuilder
77 elif treeType == "lxml":
78 from . import etree_lxml
79 treeBuilderCache[treeType] = etree_lxml.TreeBuilder
80 elif treeType == "etree":
81 from . import etree
82 if implementation is None:
83 implementation = default_etree
84 # NEVER cache here, caching is done in the etree submodule
85 return etree.getETreeModule(implementation, **kwargs).TreeBuilder
86 else:
87 raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
88 return treeBuilderCache.get(treeType)
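
A brief sketch of getTreeBuilder in context (assuming the vendored import path): the returned class is handed to HTMLParser, picking the lxml builder when lxml is importable and falling back to the stdlib etree builder otherwise.

    from pip._vendor import html5lib
    from pip._vendor.html5lib import treebuilders

    try:
        TreeBuilder = treebuilders.getTreeBuilder('lxml')
    except ImportError:
        # lxml is optional; the etree builder only needs the stdlib.
        TreeBuilder = treebuilders.getTreeBuilder('etree')

    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse('<table><td>cell</td></table>')
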
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/base.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/base.py
new file mode 100644
index 0000000..ed32fcb
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/base.py
@@ -0,0 +1,417 @@
1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import text_type
3
4from ..constants import scopingElements, tableInsertModeElements, namespaces
5
6# The scope markers are inserted when entering object elements,
7# marquees, table cells, and table captions, and are used to prevent formatting
8# from "leaking" into tables, object elements, and marquees.
9Marker = None
10
11listElementsMap = {
12 None: (frozenset(scopingElements), False),
13 "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
14 "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
15 (namespaces["html"], "ul")])), False),
16 "table": (frozenset([(namespaces["html"], "html"),
17 (namespaces["html"], "table")]), False),
18 "select": (frozenset([(namespaces["html"], "optgroup"),
19 (namespaces["html"], "option")]), True)
20}
21
22
23class Node(object):
24 """Represents an item in the tree"""
25 def __init__(self, name):
26 """Creates a Node
27
28 :arg name: The tag name associated with the node
29
30 """
31        # The tag name associated with the node
32 self.name = name
33 # The parent of the current node (or None for the document node)
34 self.parent = None
35 # The value of the current node (applies to text nodes and comments)
36 self.value = None
37 # A dict holding name -> value pairs for attributes of the node
38 self.attributes = {}
39 # A list of child nodes of the current node. This must include all
40 # elements but not necessarily other node types.
41 self.childNodes = []
42 # A list of miscellaneous flags that can be set on the node.
43 self._flags = []
44
45 def __str__(self):
46 attributesStr = " ".join(["%s=\"%s\"" % (name, value)
47 for name, value in
48 self.attributes.items()])
49 if attributesStr:
50 return "<%s %s>" % (self.name, attributesStr)
51 else:
52 return "<%s>" % (self.name)
53
54 def __repr__(self):
55 return "<%s>" % (self.name)
56
57 def appendChild(self, node):
58 """Insert node as a child of the current node
59
60 :arg node: the node to insert
61
62 """
63 raise NotImplementedError
64
65 def insertText(self, data, insertBefore=None):
66 """Insert data as text in the current node, positioned before the
67 start of node insertBefore or to the end of the node's text.
68
69 :arg data: the data to insert
70
71 :arg insertBefore: True if you want to insert the text before the node
72 and False if you want to insert it after the node
73
74 """
75 raise NotImplementedError
76
77 def insertBefore(self, node, refNode):
78 """Insert node as a child of the current node, before refNode in the
79 list of child nodes. Raises ValueError if refNode is not a child of
80 the current node
81
82 :arg node: the node to insert
83
84 :arg refNode: the child node to insert the node before
85
86 """
87 raise NotImplementedError
88
89 def removeChild(self, node):
90 """Remove node from the children of the current node
91
92 :arg node: the child node to remove
93
94 """
95 raise NotImplementedError
96
97 def reparentChildren(self, newParent):
98 """Move all the children of the current node to newParent.
99 This is needed so that trees that don't store text as nodes move the
100 text in the correct way
101
102 :arg newParent: the node to move all this node's children to
103
104 """
105 # XXX - should this method be made more general?
106 for child in self.childNodes:
107 newParent.appendChild(child)
108 self.childNodes = []
109
110 def cloneNode(self):
111 """Return a shallow copy of the current node i.e. a node with the same
112 name and attributes but with no parent or child nodes
113 """
114 raise NotImplementedError
115
116 def hasContent(self):
117 """Return true if the node has children or text, false otherwise
118 """
119 raise NotImplementedError
120
121
122class ActiveFormattingElements(list):
123 def append(self, node):
124 equalCount = 0
125 if node != Marker:
126 for element in self[::-1]:
127 if element == Marker:
128 break
129 if self.nodesEqual(element, node):
130 equalCount += 1
131 if equalCount == 3:
132 self.remove(element)
133 break
134 list.append(self, node)
135
136 def nodesEqual(self, node1, node2):
137 if not node1.nameTuple == node2.nameTuple:
138 return False
139
140 if not node1.attributes == node2.attributes:
141 return False
142
143 return True
144
145
146class TreeBuilder(object):
147 """Base treebuilder implementation
148
149 * documentClass - the class to use for the bottommost node of a document
150 * elementClass - the class to use for HTML Elements
151 * commentClass - the class to use for comments
152 * doctypeClass - the class to use for doctypes
153
154 """
155 # pylint:disable=not-callable
156
157 # Document class
158 documentClass = None
159
160 # The class to use for creating a node
161 elementClass = None
162
163 # The class to use for creating comments
164 commentClass = None
165
166 # The class to use for creating doctypes
167 doctypeClass = None
168
169 # Fragment class
170 fragmentClass = None
171
172 def __init__(self, namespaceHTMLElements):
173 """Create a TreeBuilder
174
175 :arg namespaceHTMLElements: whether or not to namespace HTML elements
176
177 """
178 if namespaceHTMLElements:
179 self.defaultNamespace = "http://www.w3.org/1999/xhtml"
180 else:
181 self.defaultNamespace = None
182 self.reset()
183
184 def reset(self):
185 self.openElements = []
186 self.activeFormattingElements = ActiveFormattingElements()
187
188 # XXX - rename these to headElement, formElement
189 self.headPointer = None
190 self.formPointer = None
191
192 self.insertFromTable = False
193
194 self.document = self.documentClass()
195
196 def elementInScope(self, target, variant=None):
197
198        # If we pass a node in we match that; if we pass a string we
199        # match any node with that name
200 exactNode = hasattr(target, "nameTuple")
201 if not exactNode:
202 if isinstance(target, text_type):
203 target = (namespaces["html"], target)
204 assert isinstance(target, tuple)
205
206 listElements, invert = listElementsMap[variant]
207
208 for node in reversed(self.openElements):
209 if exactNode and node == target:
210 return True
211 elif not exactNode and node.nameTuple == target:
212 return True
213 elif (invert ^ (node.nameTuple in listElements)):
214 return False
215
216 assert False # We should never reach this point
217
218 def reconstructActiveFormattingElements(self):
219 # Within this algorithm the order of steps described in the
220 # specification is not quite the same as the order of steps in the
221 # code. It should still do the same though.
222
223 # Step 1: stop the algorithm when there's nothing to do.
224 if not self.activeFormattingElements:
225 return
226
227 # Step 2 and step 3: we start with the last element. So i is -1.
228 i = len(self.activeFormattingElements) - 1
229 entry = self.activeFormattingElements[i]
230 if entry == Marker or entry in self.openElements:
231 return
232
233 # Step 6
234 while entry != Marker and entry not in self.openElements:
235 if i == 0:
236 # This will be reset to 0 below
237 i = -1
238 break
239 i -= 1
240 # Step 5: let entry be one earlier in the list.
241 entry = self.activeFormattingElements[i]
242
243 while True:
244 # Step 7
245 i += 1
246
247 # Step 8
248 entry = self.activeFormattingElements[i]
249 clone = entry.cloneNode() # Mainly to get a new copy of the attributes
250
251 # Step 9
252 element = self.insertElement({"type": "StartTag",
253 "name": clone.name,
254 "namespace": clone.namespace,
255 "data": clone.attributes})
256
257 # Step 10
258 self.activeFormattingElements[i] = element
259
260 # Step 11
261 if element == self.activeFormattingElements[-1]:
262 break
263
264 def clearActiveFormattingElements(self):
265 entry = self.activeFormattingElements.pop()
266 while self.activeFormattingElements and entry != Marker:
267 entry = self.activeFormattingElements.pop()
268
269 def elementInActiveFormattingElements(self, name):
270 """Check if an element exists between the end of the active
271 formatting elements and the last marker. If it does, return it, else
272 return false"""
273
274 for item in self.activeFormattingElements[::-1]:
275 # Check for Marker first because if it's a Marker it doesn't have a
276 # name attribute.
277 if item == Marker:
278 break
279 elif item.name == name:
280 return item
281 return False
282
283 def insertRoot(self, token):
284 element = self.createElement(token)
285 self.openElements.append(element)
286 self.document.appendChild(element)
287
288 def insertDoctype(self, token):
289 name = token["name"]
290 publicId = token["publicId"]
291 systemId = token["systemId"]
292
293 doctype = self.doctypeClass(name, publicId, systemId)
294 self.document.appendChild(doctype)
295
296 def insertComment(self, token, parent=None):
297 if parent is None:
298 parent = self.openElements[-1]
299 parent.appendChild(self.commentClass(token["data"]))
300
301 def createElement(self, token):
302 """Create an element but don't insert it anywhere"""
303 name = token["name"]
304 namespace = token.get("namespace", self.defaultNamespace)
305 element = self.elementClass(name, namespace)
306 element.attributes = token["data"]
307 return element
308
309 def _getInsertFromTable(self):
310 return self._insertFromTable
311
312 def _setInsertFromTable(self, value):
313 """Switch the function used to insert an element from the
314 normal one to the misnested table one and back again"""
315 self._insertFromTable = value
316 if value:
317 self.insertElement = self.insertElementTable
318 else:
319 self.insertElement = self.insertElementNormal
320
321 insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
322
323 def insertElementNormal(self, token):
324 name = token["name"]
325 assert isinstance(name, text_type), "Element %s not unicode" % name
326 namespace = token.get("namespace", self.defaultNamespace)
327 element = self.elementClass(name, namespace)
328 element.attributes = token["data"]
329 self.openElements[-1].appendChild(element)
330 self.openElements.append(element)
331 return element
332
333 def insertElementTable(self, token):
334 """Create an element and insert it into the tree"""
335 element = self.createElement(token)
336 if self.openElements[-1].name not in tableInsertModeElements:
337 return self.insertElementNormal(token)
338 else:
339 # We should be in the InTable mode. This means we want to do
340 # special magic element rearranging
341 parent, insertBefore = self.getTableMisnestedNodePosition()
342 if insertBefore is None:
343 parent.appendChild(element)
344 else:
345 parent.insertBefore(element, insertBefore)
346 self.openElements.append(element)
347 return element
348
349 def insertText(self, data, parent=None):
350 """Insert text data."""
351 if parent is None:
352 parent = self.openElements[-1]
353
354 if (not self.insertFromTable or (self.insertFromTable and
355 self.openElements[-1].name
356 not in tableInsertModeElements)):
357 parent.insertText(data)
358 else:
359 # We should be in the InTable mode. This means we want to do
360 # special magic element rearranging
361 parent, insertBefore = self.getTableMisnestedNodePosition()
362 parent.insertText(data, insertBefore)
363
364 def getTableMisnestedNodePosition(self):
365 """Get the foster parent element, and sibling to insert before
366 (or None) when inserting a misnested table node"""
367 # The foster parent element is the one which comes before the most
368 # recently opened table element
369 # XXX - this is really inelegant
370 lastTable = None
371 fosterParent = None
372 insertBefore = None
373 for elm in self.openElements[::-1]:
374 if elm.name == "table":
375 lastTable = elm
376 break
377 if lastTable:
378 # XXX - we should really check that this parent is actually a
379 # node here
380 if lastTable.parent:
381 fosterParent = lastTable.parent
382 insertBefore = lastTable
383 else:
384 fosterParent = self.openElements[
385 self.openElements.index(lastTable) - 1]
386 else:
387 fosterParent = self.openElements[0]
388 return fosterParent, insertBefore
389
390 def generateImpliedEndTags(self, exclude=None):
391 name = self.openElements[-1].name
392 # XXX td, th and tr are not actually needed
393 if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
394 name != exclude):
395 self.openElements.pop()
396 # XXX This is not entirely what the specification says. We should
397 # investigate it more closely.
398 self.generateImpliedEndTags(exclude)
399
400 def getDocument(self):
401 """Return the final tree"""
402 return self.document
403
404 def getFragment(self):
405 """Return the final fragment"""
406 # assert self.innerHTML
407 fragment = self.fragmentClass()
408 self.openElements[0].reparentChildren(fragment)
409 return fragment
410
411 def testSerializer(self, node):
412 """Serialize the subtree of node in the format required by unit tests
413
414 :arg node: the node from which to start serializing
415
416 """
417 raise NotImplementedError
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/dom.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/dom.py
new file mode 100644
index 0000000..8117b2d
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/dom.py
@@ -0,0 +1,236 @@
1from __future__ import absolute_import, division, unicode_literals
2
3
4from collections import MutableMapping
5from xml.dom import minidom, Node
6import weakref
7
8from . import base
9from .. import constants
10from ..constants import namespaces
11from .._utils import moduleFactoryFactory
12
13
14def getDomBuilder(DomImplementation):
15 Dom = DomImplementation
16
17 class AttrList(MutableMapping):
18 def __init__(self, element):
19 self.element = element
20
21 def __iter__(self):
22 return iter(self.element.attributes.keys())
23
24 def __setitem__(self, name, value):
25 if isinstance(name, tuple):
26 raise NotImplementedError
27 else:
28 attr = self.element.ownerDocument.createAttribute(name)
29 attr.value = value
30 self.element.attributes[name] = attr
31
32 def __len__(self):
33 return len(self.element.attributes)
34
35 def items(self):
36 return list(self.element.attributes.items())
37
38 def values(self):
39 return list(self.element.attributes.values())
40
41 def __getitem__(self, name):
42 if isinstance(name, tuple):
43 raise NotImplementedError
44 else:
45 return self.element.attributes[name].value
46
47 def __delitem__(self, name):
48 if isinstance(name, tuple):
49 raise NotImplementedError
50 else:
51 del self.element.attributes[name]
52
53 class NodeBuilder(base.Node):
54 def __init__(self, element):
55 base.Node.__init__(self, element.nodeName)
56 self.element = element
57
58 namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
59 self.element.namespaceURI or None)
60
61 def appendChild(self, node):
62 node.parent = self
63 self.element.appendChild(node.element)
64
65 def insertText(self, data, insertBefore=None):
66 text = self.element.ownerDocument.createTextNode(data)
67 if insertBefore:
68 self.element.insertBefore(text, insertBefore.element)
69 else:
70 self.element.appendChild(text)
71
72 def insertBefore(self, node, refNode):
73 self.element.insertBefore(node.element, refNode.element)
74 node.parent = self
75
76 def removeChild(self, node):
77 if node.element.parentNode == self.element:
78 self.element.removeChild(node.element)
79 node.parent = None
80
81 def reparentChildren(self, newParent):
82 while self.element.hasChildNodes():
83 child = self.element.firstChild
84 self.element.removeChild(child)
85 newParent.element.appendChild(child)
86 self.childNodes = []
87
88 def getAttributes(self):
89 return AttrList(self.element)
90
91 def setAttributes(self, attributes):
92 if attributes:
93 for name, value in list(attributes.items()):
94 if isinstance(name, tuple):
95 if name[0] is not None:
96 qualifiedName = (name[0] + ":" + name[1])
97 else:
98 qualifiedName = name[1]
99 self.element.setAttributeNS(name[2], qualifiedName,
100 value)
101 else:
102 self.element.setAttribute(
103 name, value)
104 attributes = property(getAttributes, setAttributes)
105
106 def cloneNode(self):
107 return NodeBuilder(self.element.cloneNode(False))
108
109 def hasContent(self):
110 return self.element.hasChildNodes()
111
112 def getNameTuple(self):
113 if self.namespace is None:
114 return namespaces["html"], self.name
115 else:
116 return self.namespace, self.name
117
118 nameTuple = property(getNameTuple)
119
120 class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
121 def documentClass(self):
122 self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
123 return weakref.proxy(self)
124
125 def insertDoctype(self, token):
126 name = token["name"]
127 publicId = token["publicId"]
128 systemId = token["systemId"]
129
130 domimpl = Dom.getDOMImplementation()
131 doctype = domimpl.createDocumentType(name, publicId, systemId)
132 self.document.appendChild(NodeBuilder(doctype))
133 if Dom == minidom:
134 doctype.ownerDocument = self.dom
135
136 def elementClass(self, name, namespace=None):
137 if namespace is None and self.defaultNamespace is None:
138 node = self.dom.createElement(name)
139 else:
140 node = self.dom.createElementNS(namespace, name)
141
142 return NodeBuilder(node)
143
144 def commentClass(self, data):
145 return NodeBuilder(self.dom.createComment(data))
146
147 def fragmentClass(self):
148 return NodeBuilder(self.dom.createDocumentFragment())
149
150 def appendChild(self, node):
151 self.dom.appendChild(node.element)
152
153 def testSerializer(self, element):
154 return testSerializer(element)
155
156 def getDocument(self):
157 return self.dom
158
159 def getFragment(self):
160 return base.TreeBuilder.getFragment(self).element
161
162 def insertText(self, data, parent=None):
163 data = data
164 if parent != self:
165 base.TreeBuilder.insertText(self, data, parent)
166 else:
167 # HACK: allow text nodes as children of the document node
168 if hasattr(self.dom, '_child_node_types'):
169 # pylint:disable=protected-access
170 if Node.TEXT_NODE not in self.dom._child_node_types:
171 self.dom._child_node_types = list(self.dom._child_node_types)
172 self.dom._child_node_types.append(Node.TEXT_NODE)
173 self.dom.appendChild(self.dom.createTextNode(data))
174
175 implementation = DomImplementation
176 name = None
177
178 def testSerializer(element):
179 element.normalize()
180 rv = []
181
182 def serializeElement(element, indent=0):
183 if element.nodeType == Node.DOCUMENT_TYPE_NODE:
184 if element.name:
185 if element.publicId or element.systemId:
186 publicId = element.publicId or ""
187 systemId = element.systemId or ""
188 rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
189 (' ' * indent, element.name, publicId, systemId))
190 else:
191 rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
192 else:
193 rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
194 elif element.nodeType == Node.DOCUMENT_NODE:
195 rv.append("#document")
196 elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
197 rv.append("#document-fragment")
198 elif element.nodeType == Node.COMMENT_NODE:
199 rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
200 elif element.nodeType == Node.TEXT_NODE:
201 rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
202 else:
203 if (hasattr(element, "namespaceURI") and
204 element.namespaceURI is not None):
205 name = "%s %s" % (constants.prefixes[element.namespaceURI],
206 element.nodeName)
207 else:
208 name = element.nodeName
209 rv.append("|%s<%s>" % (' ' * indent, name))
210 if element.hasAttributes():
211 attributes = []
212 for i in range(len(element.attributes)):
213 attr = element.attributes.item(i)
214 name = attr.nodeName
215 value = attr.value
216 ns = attr.namespaceURI
217 if ns:
218 name = "%s %s" % (constants.prefixes[ns], attr.localName)
219 else:
220 name = attr.nodeName
221 attributes.append((name, value))
222
223 for name, value in sorted(attributes):
224 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
225 indent += 2
226 for child in element.childNodes:
227 serializeElement(child, indent)
228 serializeElement(element, 0)
229
230 return "\n".join(rv)
231
232 return locals()
233
234
235# The actual means to get a module!
236getDomModule = moduleFactoryFactory(getDomBuilder)
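
A sketch of the dom treebuilder in use (assumptions: the vendored import path, with xml.dom.minidom as the implementation, which is also the default chosen in treebuilders/__init__.py): parsing returns a plain minidom Document, so the ordinary DOM API applies.

    from xml.dom import minidom
    from pip._vendor import html5lib

    DomTreeBuilder = html5lib.getTreeBuilder('dom', minidom)
    parser = html5lib.HTMLParser(tree=DomTreeBuilder)
    document = parser.parse('<p title="greeting">Hi!</p>')

    p = document.getElementsByTagName('p')[0]
    print(p.getAttribute('title'))     # -> 'greeting'
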
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree.py
new file mode 100644
index 0000000..9a4aa95
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree.py
@@ -0,0 +1,340 @@
1from __future__ import absolute_import, division, unicode_literals
2# pylint:disable=protected-access
3
4from pip._vendor.six import text_type
5
6import re
7
8from . import base
9from .. import _ihatexml
10from .. import constants
11from ..constants import namespaces
12from .._utils import moduleFactoryFactory
13
14tag_regexp = re.compile("{([^}]*)}(.*)")
15
16
17def getETreeBuilder(ElementTreeImplementation, fullTree=False):
18 ElementTree = ElementTreeImplementation
19 ElementTreeCommentType = ElementTree.Comment("asd").tag
20
21 class Element(base.Node):
22 def __init__(self, name, namespace=None):
23 self._name = name
24 self._namespace = namespace
25 self._element = ElementTree.Element(self._getETreeTag(name,
26 namespace))
27 if namespace is None:
28 self.nameTuple = namespaces["html"], self._name
29 else:
30 self.nameTuple = self._namespace, self._name
31 self.parent = None
32 self._childNodes = []
33 self._flags = []
34
35 def _getETreeTag(self, name, namespace):
36 if namespace is None:
37 etree_tag = name
38 else:
39 etree_tag = "{%s}%s" % (namespace, name)
40 return etree_tag
41
42 def _setName(self, name):
43 self._name = name
44 self._element.tag = self._getETreeTag(self._name, self._namespace)
45
46 def _getName(self):
47 return self._name
48
49 name = property(_getName, _setName)
50
51 def _setNamespace(self, namespace):
52 self._namespace = namespace
53 self._element.tag = self._getETreeTag(self._name, self._namespace)
54
55 def _getNamespace(self):
56 return self._namespace
57
58 namespace = property(_getNamespace, _setNamespace)
59
60 def _getAttributes(self):
61 return self._element.attrib
62
63 def _setAttributes(self, attributes):
64 # Delete existing attributes first
65 # XXX - there may be a better way to do this...
66 for key in list(self._element.attrib.keys()):
67 del self._element.attrib[key]
68 for key, value in attributes.items():
69 if isinstance(key, tuple):
70 name = "{%s}%s" % (key[2], key[1])
71 else:
72 name = key
73 self._element.set(name, value)
74
75 attributes = property(_getAttributes, _setAttributes)
76
77 def _getChildNodes(self):
78 return self._childNodes
79
80 def _setChildNodes(self, value):
81 del self._element[:]
82 self._childNodes = []
83 for element in value:
84 self.insertChild(element)
85
86 childNodes = property(_getChildNodes, _setChildNodes)
87
88 def hasContent(self):
89 """Return true if the node has children or text"""
90 return bool(self._element.text or len(self._element))
91
92 def appendChild(self, node):
93 self._childNodes.append(node)
94 self._element.append(node._element)
95 node.parent = self
96
97 def insertBefore(self, node, refNode):
98 index = list(self._element).index(refNode._element)
99 self._element.insert(index, node._element)
100 node.parent = self
101
102 def removeChild(self, node):
103 self._childNodes.remove(node)
104 self._element.remove(node._element)
105 node.parent = None
106
107 def insertText(self, data, insertBefore=None):
108 if not(len(self._element)):
109 if not self._element.text:
110 self._element.text = ""
111 self._element.text += data
112 elif insertBefore is None:
113 # Insert the text as the tail of the last child element
114 if not self._element[-1].tail:
115 self._element[-1].tail = ""
116 self._element[-1].tail += data
117 else:
118 # Insert the text before the specified node
119 children = list(self._element)
120 index = children.index(insertBefore._element)
121 if index > 0:
122 if not self._element[index - 1].tail:
123 self._element[index - 1].tail = ""
124 self._element[index - 1].tail += data
125 else:
126 if not self._element.text:
127 self._element.text = ""
128 self._element.text += data
129
130 def cloneNode(self):
131 element = type(self)(self.name, self.namespace)
132 for name, value in self.attributes.items():
133 element.attributes[name] = value
134 return element
135
136 def reparentChildren(self, newParent):
137 if newParent.childNodes:
138 newParent.childNodes[-1]._element.tail += self._element.text
139 else:
140 if not newParent._element.text:
141 newParent._element.text = ""
142 if self._element.text is not None:
143 newParent._element.text += self._element.text
144 self._element.text = ""
145 base.Node.reparentChildren(self, newParent)
146
147 class Comment(Element):
148 def __init__(self, data):
149 # Use the superclass constructor to set all properties on the
150 # wrapper element
151 self._element = ElementTree.Comment(data)
152 self.parent = None
153 self._childNodes = []
154 self._flags = []
155
156 def _getData(self):
157 return self._element.text
158
159 def _setData(self, value):
160 self._element.text = value
161
162 data = property(_getData, _setData)
163
164 class DocumentType(Element):
165 def __init__(self, name, publicId, systemId):
166 Element.__init__(self, "<!DOCTYPE>")
167 self._element.text = name
168 self.publicId = publicId
169 self.systemId = systemId
170
171 def _getPublicId(self):
172 return self._element.get("publicId", "")
173
174 def _setPublicId(self, value):
175 if value is not None:
176 self._element.set("publicId", value)
177
178 publicId = property(_getPublicId, _setPublicId)
179
180 def _getSystemId(self):
181 return self._element.get("systemId", "")
182
183 def _setSystemId(self, value):
184 if value is not None:
185 self._element.set("systemId", value)
186
187 systemId = property(_getSystemId, _setSystemId)
188
189 class Document(Element):
190 def __init__(self):
191 Element.__init__(self, "DOCUMENT_ROOT")
192
193 class DocumentFragment(Element):
194 def __init__(self):
195 Element.__init__(self, "DOCUMENT_FRAGMENT")
196
197 def testSerializer(element):
198 rv = []
199
200 def serializeElement(element, indent=0):
201 if not(hasattr(element, "tag")):
202 element = element.getroot()
203 if element.tag == "<!DOCTYPE>":
204 if element.get("publicId") or element.get("systemId"):
205 publicId = element.get("publicId") or ""
206 systemId = element.get("systemId") or ""
207 rv.append("""<!DOCTYPE %s "%s" "%s">""" %
208 (element.text, publicId, systemId))
209 else:
210 rv.append("<!DOCTYPE %s>" % (element.text,))
211 elif element.tag == "DOCUMENT_ROOT":
212 rv.append("#document")
213 if element.text is not None:
214 rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
215 if element.tail is not None:
216 raise TypeError("Document node cannot have tail")
217 if hasattr(element, "attrib") and len(element.attrib):
218 raise TypeError("Document node cannot have attributes")
219 elif element.tag == ElementTreeCommentType:
220 rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
221 else:
222 assert isinstance(element.tag, text_type), \
223 "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
224 nsmatch = tag_regexp.match(element.tag)
225
226 if nsmatch is None:
227 name = element.tag
228 else:
229 ns, name = nsmatch.groups()
230 prefix = constants.prefixes[ns]
231 name = "%s %s" % (prefix, name)
232 rv.append("|%s<%s>" % (' ' * indent, name))
233
234 if hasattr(element, "attrib"):
235 attributes = []
236 for name, value in element.attrib.items():
237 nsmatch = tag_regexp.match(name)
238 if nsmatch is not None:
239 ns, name = nsmatch.groups()
240 prefix = constants.prefixes[ns]
241 attr_string = "%s %s" % (prefix, name)
242 else:
243 attr_string = name
244 attributes.append((attr_string, value))
245
246 for name, value in sorted(attributes):
247 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
248 if element.text:
249 rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
250 indent += 2
251 for child in element:
252 serializeElement(child, indent)
253 if element.tail:
254 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
255 serializeElement(element, 0)
256
257 return "\n".join(rv)
258
259 def tostring(element): # pylint:disable=unused-variable
260 """Serialize an element and its child nodes to a string"""
261 rv = []
262 filter = _ihatexml.InfosetFilter()
263
264 def serializeElement(element):
265 if isinstance(element, ElementTree.ElementTree):
266 element = element.getroot()
267
268 if element.tag == "<!DOCTYPE>":
269 if element.get("publicId") or element.get("systemId"):
270 publicId = element.get("publicId") or ""
271 systemId = element.get("systemId") or ""
272 rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
273 (element.text, publicId, systemId))
274 else:
275 rv.append("<!DOCTYPE %s>" % (element.text,))
276 elif element.tag == "DOCUMENT_ROOT":
277 if element.text is not None:
278 rv.append(element.text)
279 if element.tail is not None:
280 raise TypeError("Document node cannot have tail")
281 if hasattr(element, "attrib") and len(element.attrib):
282 raise TypeError("Document node cannot have attributes")
283
284 for child in element:
285 serializeElement(child)
286
287 elif element.tag == ElementTreeCommentType:
288 rv.append("<!--%s-->" % (element.text,))
289 else:
290 # This is assumed to be an ordinary element
291 if not element.attrib:
292 rv.append("<%s>" % (filter.fromXmlName(element.tag),))
293 else:
294 attr = " ".join(["%s=\"%s\"" % (
295 filter.fromXmlName(name), value)
296 for name, value in element.attrib.items()])
297 rv.append("<%s %s>" % (element.tag, attr))
298 if element.text:
299 rv.append(element.text)
300
301 for child in element:
302 serializeElement(child)
303
304 rv.append("</%s>" % (element.tag,))
305
306 if element.tail:
307 rv.append(element.tail)
308
309 serializeElement(element)
310
311 return "".join(rv)
312
313 class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
314 documentClass = Document
315 doctypeClass = DocumentType
316 elementClass = Element
317 commentClass = Comment
318 fragmentClass = DocumentFragment
319 implementation = ElementTreeImplementation
320
321 def testSerializer(self, element):
322 return testSerializer(element)
323
324 def getDocument(self):
325 if fullTree:
326 return self.document._element
327 else:
328 if self.defaultNamespace is not None:
329 return self.document._element.find(
330 "{%s}html" % self.defaultNamespace)
331 else:
332 return self.document._element.find("html")
333
334 def getFragment(self):
335 return base.TreeBuilder.getFragment(self)._element
336
337 return locals()
338
339
340getETreeModule = moduleFactoryFactory(getETreeBuilder)
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree_lxml.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree_lxml.py
new file mode 100644
index 0000000..66a9ba3
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree_lxml.py
@@ -0,0 +1,366 @@
1"""Module for supporting the lxml.etree library. The idea here is to use as much
2of the native library as possible, without using fragile hacks like custom element
3names that break between releases. The downside of this is that we cannot represent
4all possible trees; specifically the following are known to cause problems:
5
6Text or comments as siblings of the root element
7Doctypes with no name
8
9When any of these things occur, we emit a DataLossWarning
10"""
11
12from __future__ import absolute_import, division, unicode_literals
13# pylint:disable=protected-access
14
15import warnings
16import re
17import sys
18
19from . import base
20from ..constants import DataLossWarning
21from .. import constants
22from . import etree as etree_builders
23from .. import _ihatexml
24
25import lxml.etree as etree
26
27
28fullTree = True
29tag_regexp = re.compile("{([^}]*)}(.*)")
30
31comment_type = etree.Comment("asd").tag
32
33
34class DocumentType(object):
35 def __init__(self, name, publicId, systemId):
36 self.name = name
37 self.publicId = publicId
38 self.systemId = systemId
39
40
41class Document(object):
42 def __init__(self):
43 self._elementTree = None
44 self._childNodes = []
45
46 def appendChild(self, element):
47 self._elementTree.getroot().addnext(element._element)
48
49 def _getChildNodes(self):
50 return self._childNodes
51
52 childNodes = property(_getChildNodes)
53
54
55def testSerializer(element):
56 rv = []
57 infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
58
59 def serializeElement(element, indent=0):
60 if not hasattr(element, "tag"):
61 if hasattr(element, "getroot"):
62 # Full tree case
63 rv.append("#document")
64 if element.docinfo.internalDTD:
65 if not (element.docinfo.public_id or
66 element.docinfo.system_url):
67 dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
68 else:
69 dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
70 element.docinfo.root_name,
71 element.docinfo.public_id,
72 element.docinfo.system_url)
73 rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
74 next_element = element.getroot()
75 while next_element.getprevious() is not None:
76 next_element = next_element.getprevious()
77 while next_element is not None:
78 serializeElement(next_element, indent + 2)
79 next_element = next_element.getnext()
80 elif isinstance(element, str) or isinstance(element, bytes):
81 # Text in a fragment
82 assert isinstance(element, str) or sys.version_info[0] == 2
83 rv.append("|%s\"%s\"" % (' ' * indent, element))
84 else:
85 # Fragment case
86 rv.append("#document-fragment")
87 for next_element in element:
88 serializeElement(next_element, indent + 2)
89 elif element.tag == comment_type:
90 rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
91 if hasattr(element, "tail") and element.tail:
92 rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
93 else:
94 assert isinstance(element, etree._Element)
95 nsmatch = etree_builders.tag_regexp.match(element.tag)
96 if nsmatch is not None:
97 ns = nsmatch.group(1)
98 tag = nsmatch.group(2)
99 prefix = constants.prefixes[ns]
100 rv.append("|%s<%s %s>" % (' ' * indent, prefix,
101 infosetFilter.fromXmlName(tag)))
102 else:
103 rv.append("|%s<%s>" % (' ' * indent,
104 infosetFilter.fromXmlName(element.tag)))
105
106 if hasattr(element, "attrib"):
107 attributes = []
108 for name, value in element.attrib.items():
109 nsmatch = tag_regexp.match(name)
110 if nsmatch is not None:
111 ns, name = nsmatch.groups()
112 name = infosetFilter.fromXmlName(name)
113 prefix = constants.prefixes[ns]
114 attr_string = "%s %s" % (prefix, name)
115 else:
116 attr_string = infosetFilter.fromXmlName(name)
117 attributes.append((attr_string, value))
118
119 for name, value in sorted(attributes):
120 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
121
122 if element.text:
123 rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
124 indent += 2
125 for child in element:
126 serializeElement(child, indent)
127 if hasattr(element, "tail") and element.tail:
128 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
129 serializeElement(element, 0)
130
131 return "\n".join(rv)
132
133
134def tostring(element):
135 """Serialize an element and its child nodes to a string"""
136 rv = []
137
138 def serializeElement(element):
139 if not hasattr(element, "tag"):
140 if element.docinfo.internalDTD:
141 if element.docinfo.doctype:
142 dtd_str = element.docinfo.doctype
143 else:
144 dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
145 rv.append(dtd_str)
146 serializeElement(element.getroot())
147
148 elif element.tag == comment_type:
149 rv.append("<!--%s-->" % (element.text,))
150
151 else:
152 # This is assumed to be an ordinary element
153 if not element.attrib:
154 rv.append("<%s>" % (element.tag,))
155 else:
156 attr = " ".join(["%s=\"%s\"" % (name, value)
157 for name, value in element.attrib.items()])
158 rv.append("<%s %s>" % (element.tag, attr))
159 if element.text:
160 rv.append(element.text)
161
162 for child in element:
163 serializeElement(child)
164
165 rv.append("</%s>" % (element.tag,))
166
167 if hasattr(element, "tail") and element.tail:
168 rv.append(element.tail)
169
170 serializeElement(element)
171
172 return "".join(rv)
173
174
175class TreeBuilder(base.TreeBuilder):
176 documentClass = Document
177 doctypeClass = DocumentType
178 elementClass = None
179 commentClass = None
180 fragmentClass = Document
181 implementation = etree
182
183 def __init__(self, namespaceHTMLElements, fullTree=False):
184 builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
185 infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
186 self.namespaceHTMLElements = namespaceHTMLElements
187
188 class Attributes(dict):
189 def __init__(self, element, value=None):
190 if value is None:
191 value = {}
192 self._element = element
193 dict.__init__(self, value) # pylint:disable=non-parent-init-called
194 for key, value in self.items():
195 if isinstance(key, tuple):
196 name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
197 else:
198 name = infosetFilter.coerceAttribute(key)
199 self._element._element.attrib[name] = value
200
201 def __setitem__(self, key, value):
202 dict.__setitem__(self, key, value)
203 if isinstance(key, tuple):
204 name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
205 else:
206 name = infosetFilter.coerceAttribute(key)
207 self._element._element.attrib[name] = value
208
209 class Element(builder.Element):
210 def __init__(self, name, namespace):
211 name = infosetFilter.coerceElement(name)
212 builder.Element.__init__(self, name, namespace=namespace)
213 self._attributes = Attributes(self)
214
215 def _setName(self, name):
216 self._name = infosetFilter.coerceElement(name)
217 self._element.tag = self._getETreeTag(
218 self._name, self._namespace)
219
220 def _getName(self):
221 return infosetFilter.fromXmlName(self._name)
222
223 name = property(_getName, _setName)
224
225 def _getAttributes(self):
226 return self._attributes
227
228 def _setAttributes(self, attributes):
229 self._attributes = Attributes(self, attributes)
230
231 attributes = property(_getAttributes, _setAttributes)
232
233 def insertText(self, data, insertBefore=None):
234 data = infosetFilter.coerceCharacters(data)
235 builder.Element.insertText(self, data, insertBefore)
236
237 def appendChild(self, child):
238 builder.Element.appendChild(self, child)
239
240 class Comment(builder.Comment):
241 def __init__(self, data):
242 data = infosetFilter.coerceComment(data)
243 builder.Comment.__init__(self, data)
244
245 def _setData(self, data):
246 data = infosetFilter.coerceComment(data)
247 self._element.text = data
248
249 def _getData(self):
250 return self._element.text
251
252 data = property(_getData, _setData)
253
254 self.elementClass = Element
255 self.commentClass = Comment
256 # self.fragmentClass = builder.DocumentFragment
257 base.TreeBuilder.__init__(self, namespaceHTMLElements)
258
259 def reset(self):
260 base.TreeBuilder.reset(self)
261 self.insertComment = self.insertCommentInitial
262 self.initial_comments = []
263 self.doctype = None
264
265 def testSerializer(self, element):
266 return testSerializer(element)
267
268 def getDocument(self):
269 if fullTree:
270 return self.document._elementTree
271 else:
272 return self.document._elementTree.getroot()
273
274 def getFragment(self):
275 fragment = []
276 element = self.openElements[0]._element
277 if element.text:
278 fragment.append(element.text)
279 fragment.extend(list(element))
280 if element.tail:
281 fragment.append(element.tail)
282 return fragment
283
284 def insertDoctype(self, token):
285 name = token["name"]
286 publicId = token["publicId"]
287 systemId = token["systemId"]
288
289 if not name:
290 warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
291 self.doctype = None
292 else:
293 coercedName = self.infosetFilter.coerceElement(name)
294 if coercedName != name:
295 warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
296
297 doctype = self.doctypeClass(coercedName, publicId, systemId)
298 self.doctype = doctype
299
300 def insertCommentInitial(self, data, parent=None):
301 assert parent is None or parent is self.document
302 assert self.document._elementTree is None
303 self.initial_comments.append(data)
304
305 def insertCommentMain(self, data, parent=None):
306 if (parent == self.document and
307 self.document._elementTree.getroot()[-1].tag == comment_type):
308 warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
309 super(TreeBuilder, self).insertComment(data, parent)
310
311 def insertRoot(self, token):
312 # Because of the way libxml2 works, it doesn't seem to be possible to
313 # alter information like the doctype after the tree has been parsed.
314 # Therefore we need to use the built-in parser to create our initial
315 # tree, after which we can add elements like normal
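        # For example, a plain "<!DOCTYPE html>" doctype produces
        #     <!DOCTYPE html><THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>
        # which is parsed below; the placeholder root is then renamed to the
        # real root element taken from the token.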
316 docStr = ""
317 if self.doctype:
318 assert self.doctype.name
319 docStr += "<!DOCTYPE %s" % self.doctype.name
320 if (self.doctype.publicId is not None or
321 self.doctype.systemId is not None):
322 docStr += (' PUBLIC "%s" ' %
323 (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
324 if self.doctype.systemId:
325 sysid = self.doctype.systemId
326 if sysid.find("'") >= 0 and sysid.find('"') >= 0:
327 warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
328 sysid = sysid.replace("'", 'U00027')
329 if sysid.find("'") >= 0:
330 docStr += '"%s"' % sysid
331 else:
332 docStr += "'%s'" % sysid
333 else:
334 docStr += "''"
335 docStr += ">"
336 if self.doctype.name != token["name"]:
337 warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
338 docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
339 root = etree.fromstring(docStr)
340
341 # Append the initial comments:
342 for comment_token in self.initial_comments:
343 comment = self.commentClass(comment_token["data"])
344 root.addprevious(comment._element)
345
346 # Create the root document and add the ElementTree to it
347 self.document = self.documentClass()
348 self.document._elementTree = root.getroottree()
349
350 # Give the root element the right name
351 name = token["name"]
352 namespace = token.get("namespace", self.defaultNamespace)
353 if namespace is None:
354 etree_tag = name
355 else:
356 etree_tag = "{%s}%s" % (namespace, name)
357 root.tag = etree_tag
358
359 # Add the root element to the internal child/open data structures
360 root_element = self.elementClass(name, namespace)
361 root_element._element = root
362 self.document._childNodes.append(root_element)
363 self.openElements.append(root_element)
364
365 # Reset to the default insert comment function
366 self.insertComment = self.insertCommentMain
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/__init__.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/__init__.py
new file mode 100644
index 0000000..31a173d
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/__init__.py
@@ -0,0 +1,154 @@
1"""A collection of modules for iterating through different kinds of
2tree, generating tokens identical to those produced by the tokenizer
3module.
4
5To create a tree walker for a new type of tree, you need to
6implement a tree walker object (called TreeWalker by convention) that
7implements a 'serialize' method taking a tree as sole argument and
8returning an iterator generating tokens.
9"""
10
11from __future__ import absolute_import, division, unicode_literals
12
13from .. import constants
14from .._utils import default_etree
15
16__all__ = ["getTreeWalker", "pprint"]
17
18treeWalkerCache = {}
19
20
21def getTreeWalker(treeType, implementation=None, **kwargs):
22 """Get a TreeWalker class for various types of tree with built-in support
23
24 :arg str treeType: the name of the tree type required (case-insensitive).
25 Supported values are:
26
27 * "dom": The xml.dom.minidom DOM implementation
28 * "etree": A generic walker for tree implementations exposing an
29 elementtree-like interface (known to work with ElementTree,
30 cElementTree and lxml.etree).
31 * "lxml": Optimized walker for lxml.etree
32 * "genshi": a Genshi stream
33
34 :arg implementation: A module implementing the tree type e.g.
35 xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
36 tree type only).
37
38 :arg kwargs: keyword arguments passed to the etree walker--for other
39 walkers, this has no effect
40
41 :returns: a TreeWalker class
42
43 """
44
45 treeType = treeType.lower()
46 if treeType not in treeWalkerCache:
47 if treeType == "dom":
48 from . import dom
49 treeWalkerCache[treeType] = dom.TreeWalker
50 elif treeType == "genshi":
51 from . import genshi
52 treeWalkerCache[treeType] = genshi.TreeWalker
53 elif treeType == "lxml":
54 from . import etree_lxml
55 treeWalkerCache[treeType] = etree_lxml.TreeWalker
56 elif treeType == "etree":
57 from . import etree
58 if implementation is None:
59 implementation = default_etree
60 # XXX: NEVER cache here, caching is done in the etree submodule
61 return etree.getETreeModule(implementation, **kwargs).TreeWalker
62 return treeWalkerCache.get(treeType)
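# Usage sketch for ``getTreeWalker`` (illustrative; assumes the standalone
# ``html5lib`` package rather than this vendored copy):
#
#     import html5lib
#     from html5lib.treewalkers import getTreeWalker
#
#     tree = html5lib.parse("<p>Hello <b>world</b></p>")  # default etree tree
#     TreeWalker = getTreeWalker("etree")
#     for token in TreeWalker(tree):
#         print(token["type"], token.get("name"))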
63
64
65def concatenateCharacterTokens(tokens):
66 pendingCharacters = []
67 for token in tokens:
68 type = token["type"]
69 if type in ("Characters", "SpaceCharacters"):
70 pendingCharacters.append(token["data"])
71 else:
72 if pendingCharacters:
73 yield {"type": "Characters", "data": "".join(pendingCharacters)}
74 pendingCharacters = []
75 yield token
76 if pendingCharacters:
77 yield {"type": "Characters", "data": "".join(pendingCharacters)}
78
79
80def pprint(walker):
81 """Pretty printer for tree walkers
82
83 Takes a TreeWalker instance and pretty prints the output of walking the tree.
84
85 :arg walker: a TreeWalker instance
86
87 """
88 output = []
89 indent = 0
90 for token in concatenateCharacterTokens(walker):
91 type = token["type"]
92 if type in ("StartTag", "EmptyTag"):
93 # tag name
94 if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
95 if token["namespace"] in constants.prefixes:
96 ns = constants.prefixes[token["namespace"]]
97 else:
98 ns = token["namespace"]
99 name = "%s %s" % (ns, token["name"])
100 else:
101 name = token["name"]
102 output.append("%s<%s>" % (" " * indent, name))
103 indent += 2
104 # attributes (sorted for consistent ordering)
105 attrs = token["data"]
106 for (namespace, localname), value in sorted(attrs.items()):
107 if namespace:
108 if namespace in constants.prefixes:
109 ns = constants.prefixes[namespace]
110 else:
111 ns = namespace
112 name = "%s %s" % (ns, localname)
113 else:
114 name = localname
115 output.append("%s%s=\"%s\"" % (" " * indent, name, value))
116 # self-closing
117 if type == "EmptyTag":
118 indent -= 2
119
120 elif type == "EndTag":
121 indent -= 2
122
123 elif type == "Comment":
124 output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
125
126 elif type == "Doctype":
127 if token["name"]:
128 if token["publicId"]:
129 output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
130 (" " * indent,
131 token["name"],
132 token["publicId"],
133 token["systemId"] if token["systemId"] else ""))
134 elif token["systemId"]:
135 output.append("""%s<!DOCTYPE %s "" "%s">""" %
136 (" " * indent,
137 token["name"],
138 token["systemId"]))
139 else:
140 output.append("%s<!DOCTYPE %s>" % (" " * indent,
141 token["name"]))
142 else:
143 output.append("%s<!DOCTYPE >" % (" " * indent,))
144
145 elif type == "Characters":
146 output.append("%s\"%s\"" % (" " * indent, token["data"]))
147
148 elif type == "SpaceCharacters":
149 assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
150
151 else:
152 raise ValueError("Unknown token type, %s" % type)
153
154 return "\n".join(output)
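# Usage sketch for ``pprint`` (illustrative; assumes the standalone
# ``html5lib`` package):
#
#     import html5lib
#     from html5lib.treewalkers import getTreeWalker, pprint
#
#     tree = html5lib.parse('<p class="a">hi</p>')
#     print(pprint(getTreeWalker("etree")(tree)))
#
# which prints an indented outline of the tree, one token per line.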
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/base.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/base.py
new file mode 100644
index 0000000..f82984b
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/base.py
@@ -0,0 +1,252 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from xml.dom import Node
4from ..constants import namespaces, voidElements, spaceCharacters
5
6__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
7 "TreeWalker", "NonRecursiveTreeWalker"]
8
9DOCUMENT = Node.DOCUMENT_NODE
10DOCTYPE = Node.DOCUMENT_TYPE_NODE
11TEXT = Node.TEXT_NODE
12ELEMENT = Node.ELEMENT_NODE
13COMMENT = Node.COMMENT_NODE
14ENTITY = Node.ENTITY_NODE
15UNKNOWN = "<#UNKNOWN#>"
16
17spaceCharacters = "".join(spaceCharacters)
18
19
20class TreeWalker(object):
21 """Walks a tree yielding tokens
22
23 Tokens are dicts that all have a ``type`` field specifying the type of the
24 token.
25
26 """
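    # Example token shapes (illustrative):
    #     {"type": "StartTag", "name": "p", "namespace": None, "data": {}}
    #     {"type": "Characters", "data": "hello"}
    #     {"type": "EndTag", "name": "p", "namespace": None}
    #     {"type": "Doctype", "name": "html", "publicId": None, "systemId": None}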
27 def __init__(self, tree):
28 """Creates a TreeWalker
29
30 :arg tree: the tree to walk
31
32 """
33 self.tree = tree
34
35 def __iter__(self):
36 raise NotImplementedError
37
38 def error(self, msg):
39 """Generates an error token with the given message
40
41 :arg msg: the error message
42
43 :returns: SerializeError token
44
45 """
46 return {"type": "SerializeError", "data": msg}
47
48 def emptyTag(self, namespace, name, attrs, hasChildren=False):
49 """Generates an EmptyTag token
50
51 :arg namespace: the namespace of the token--can be ``None``
52
53 :arg name: the name of the element
54
55 :arg attrs: the attributes of the element as a dict
56
57        :arg hasChildren: whether or not to yield a SerializeError because
58            this tag shouldn't have children
59
60 :returns: EmptyTag token
61
62 """
63 yield {"type": "EmptyTag", "name": name,
64 "namespace": namespace,
65 "data": attrs}
66 if hasChildren:
67 yield self.error("Void element has children")
68
69 def startTag(self, namespace, name, attrs):
70 """Generates a StartTag token
71
72 :arg namespace: the namespace of the token--can be ``None``
73
74 :arg name: the name of the element
75
76 :arg attrs: the attributes of the element as a dict
77
78 :returns: StartTag token
79
80 """
81 return {"type": "StartTag",
82 "name": name,
83 "namespace": namespace,
84 "data": attrs}
85
86 def endTag(self, namespace, name):
87 """Generates an EndTag token
88
89 :arg namespace: the namespace of the token--can be ``None``
90
91 :arg name: the name of the element
92
93 :returns: EndTag token
94
95 """
96 return {"type": "EndTag",
97 "name": name,
98 "namespace": namespace}
99
100 def text(self, data):
101 """Generates SpaceCharacters and Characters tokens
102
103 Depending on what's in the data, this generates one or more
104 ``SpaceCharacters`` and ``Characters`` tokens.
105
106 For example:
107
108 >>> from html5lib.treewalkers.base import TreeWalker
109 >>> # Give it an empty tree just so it instantiates
110 >>> walker = TreeWalker([])
111 >>> list(walker.text(''))
112 []
113 >>> list(walker.text(' '))
114 [{u'data': ' ', u'type': u'SpaceCharacters'}]
115 >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
116 [{u'data': ' ', u'type': u'SpaceCharacters'},
117 {u'data': u'abc', u'type': u'Characters'},
118 {u'data': u' ', u'type': u'SpaceCharacters'}]
119
120 :arg data: the text data
121
122 :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
123
124 """
125 data = data
126 middle = data.lstrip(spaceCharacters)
127 left = data[:len(data) - len(middle)]
128 if left:
129 yield {"type": "SpaceCharacters", "data": left}
130 data = middle
131 middle = data.rstrip(spaceCharacters)
132 right = data[len(middle):]
133 if middle:
134 yield {"type": "Characters", "data": middle}
135 if right:
136 yield {"type": "SpaceCharacters", "data": right}
137
138 def comment(self, data):
139 """Generates a Comment token
140
141 :arg data: the comment
142
143 :returns: Comment token
144
145 """
146 return {"type": "Comment", "data": data}
147
148 def doctype(self, name, publicId=None, systemId=None):
149 """Generates a Doctype token
150
151        :arg name: the doctype name
152
153        :arg publicId: the doctype public identifier
154
155        :arg systemId: the doctype system identifier
156
157 :returns: the Doctype token
158
159 """
160 return {"type": "Doctype",
161 "name": name,
162 "publicId": publicId,
163 "systemId": systemId}
164
165 def entity(self, name):
166 """Generates an Entity token
167
168 :arg name: the entity name
169
170 :returns: an Entity token
171
172 """
173 return {"type": "Entity", "name": name}
174
175 def unknown(self, nodeType):
176 """Handles unknown node types"""
177 return self.error("Unknown node type: " + nodeType)
178
179
180class NonRecursiveTreeWalker(TreeWalker):
181 def getNodeDetails(self, node):
182 raise NotImplementedError
183
184 def getFirstChild(self, node):
185 raise NotImplementedError
186
187 def getNextSibling(self, node):
188 raise NotImplementedError
189
190 def getParentNode(self, node):
191 raise NotImplementedError
192
193 def __iter__(self):
194 currentNode = self.tree
195 while currentNode is not None:
196 details = self.getNodeDetails(currentNode)
197 type, details = details[0], details[1:]
198 hasChildren = False
199
200 if type == DOCTYPE:
201 yield self.doctype(*details)
202
203 elif type == TEXT:
204 for token in self.text(*details):
205 yield token
206
207 elif type == ELEMENT:
208 namespace, name, attributes, hasChildren = details
209 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
210 for token in self.emptyTag(namespace, name, attributes,
211 hasChildren):
212 yield token
213 hasChildren = False
214 else:
215 yield self.startTag(namespace, name, attributes)
216
217 elif type == COMMENT:
218 yield self.comment(details[0])
219
220 elif type == ENTITY:
221 yield self.entity(details[0])
222
223 elif type == DOCUMENT:
224 hasChildren = True
225
226 else:
227 yield self.unknown(details[0])
228
229 if hasChildren:
230 firstChild = self.getFirstChild(currentNode)
231 else:
232 firstChild = None
233
234 if firstChild is not None:
235 currentNode = firstChild
236 else:
237 while currentNode is not None:
238 details = self.getNodeDetails(currentNode)
239 type, details = details[0], details[1:]
240 if type == ELEMENT:
241 namespace, name, attributes, hasChildren = details
242 if (namespace and namespace != namespaces["html"]) or name not in voidElements:
243 yield self.endTag(namespace, name)
244 if self.tree is currentNode:
245 currentNode = None
246 break
247 nextSibling = self.getNextSibling(currentNode)
248 if nextSibling is not None:
249 currentNode = nextSibling
250 break
251 else:
252 currentNode = self.getParentNode(currentNode)
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/dom.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/dom.py
new file mode 100644
index 0000000..b3e2753
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/dom.py
@@ -0,0 +1,43 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from xml.dom import Node
4
5from . import base
6
7
8class TreeWalker(base.NonRecursiveTreeWalker):
9 def getNodeDetails(self, node):
10 if node.nodeType == Node.DOCUMENT_TYPE_NODE:
11 return base.DOCTYPE, node.name, node.publicId, node.systemId
12
13 elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
14 return base.TEXT, node.nodeValue
15
16 elif node.nodeType == Node.ELEMENT_NODE:
17 attrs = {}
18 for attr in list(node.attributes.keys()):
19 attr = node.getAttributeNode(attr)
20 if attr.namespaceURI:
21 attrs[(attr.namespaceURI, attr.localName)] = attr.value
22 else:
23 attrs[(None, attr.name)] = attr.value
24 return (base.ELEMENT, node.namespaceURI, node.nodeName,
25 attrs, node.hasChildNodes())
26
27 elif node.nodeType == Node.COMMENT_NODE:
28 return base.COMMENT, node.nodeValue
29
30 elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
31 return (base.DOCUMENT,)
32
33 else:
34 return base.UNKNOWN, node.nodeType
35
36 def getFirstChild(self, node):
37 return node.firstChild
38
39 def getNextSibling(self, node):
40 return node.nextSibling
41
42 def getParentNode(self, node):
43 return node.parentNode
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree.py
new file mode 100644
index 0000000..1a35add
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree.py
@@ -0,0 +1,130 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from collections import OrderedDict
4import re
5
6from pip._vendor.six import string_types
7
8from . import base
9from .._utils import moduleFactoryFactory
10
11tag_regexp = re.compile("{([^}]*)}(.*)")
12
13
14def getETreeBuilder(ElementTreeImplementation):
15 ElementTree = ElementTreeImplementation
16 ElementTreeCommentType = ElementTree.Comment("asd").tag
17
18 class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
19        """Given a particular ElementTree representation, this implementation
20        avoids recursion by returning "nodes" as tuples with the following
21 content:
22
23 1. The current element
24
25 2. The index of the element relative to its parent
26
27 3. A stack of ancestor elements
28
29 4. A flag "text", "tail" or None to indicate if the current node is a
30 text node; either the text or tail of the current element (1)
31 """
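        # A "node" handed to the methods below therefore looks like, e.g.:
        #     (<Element 'b'>, 1, [<Element 'p'>], None)    -> the <b> element itself
        #     (<Element 'b'>, 1, [<Element 'p'>], "text")  -> the text inside <b>
        #     (<Element 'b'>, 1, [<Element 'p'>], "tail")  -> the text after </b>
        # (illustrative values; the list holds the element's ancestors)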
32 def getNodeDetails(self, node):
33 if isinstance(node, tuple): # It might be the root Element
34 elt, _, _, flag = node
35 if flag in ("text", "tail"):
36 return base.TEXT, getattr(elt, flag)
37 else:
38 node = elt
39
40 if not(hasattr(node, "tag")):
41 node = node.getroot()
42
43 if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
44 return (base.DOCUMENT,)
45
46 elif node.tag == "<!DOCTYPE>":
47 return (base.DOCTYPE, node.text,
48 node.get("publicId"), node.get("systemId"))
49
50 elif node.tag == ElementTreeCommentType:
51 return base.COMMENT, node.text
52
53 else:
54 assert isinstance(node.tag, string_types), type(node.tag)
55 # This is assumed to be an ordinary element
56 match = tag_regexp.match(node.tag)
57 if match:
58 namespace, tag = match.groups()
59 else:
60 namespace = None
61 tag = node.tag
62 attrs = OrderedDict()
63 for name, value in list(node.attrib.items()):
64 match = tag_regexp.match(name)
65 if match:
66 attrs[(match.group(1), match.group(2))] = value
67 else:
68 attrs[(None, name)] = value
69 return (base.ELEMENT, namespace, tag,
70 attrs, len(node) or node.text)
71
72 def getFirstChild(self, node):
73 if isinstance(node, tuple):
74 element, key, parents, flag = node
75 else:
76 element, key, parents, flag = node, None, [], None
77
78 if flag in ("text", "tail"):
79 return None
80 else:
81 if element.text:
82 return element, key, parents, "text"
83 elif len(element):
84 parents.append(element)
85 return element[0], 0, parents, None
86 else:
87 return None
88
89 def getNextSibling(self, node):
90 if isinstance(node, tuple):
91 element, key, parents, flag = node
92 else:
93 return None
94
95 if flag == "text":
96 if len(element):
97 parents.append(element)
98 return element[0], 0, parents, None
99 else:
100 return None
101 else:
102 if element.tail and flag != "tail":
103 return element, key, parents, "tail"
104 elif key < len(parents[-1]) - 1:
105 return parents[-1][key + 1], key + 1, parents, None
106 else:
107 return None
108
109 def getParentNode(self, node):
110 if isinstance(node, tuple):
111 element, key, parents, flag = node
112 else:
113 return None
114
115 if flag == "text":
116 if not parents:
117 return element
118 else:
119 return element, key, parents, None
120 else:
121 parent = parents.pop()
122 if not parents:
123 return parent
124 else:
125 assert list(parents[-1]).count(parent) == 1
126 return parent, list(parents[-1]).index(parent), parents, None
127
128 return locals()
129
130getETreeModule = moduleFactoryFactory(getETreeBuilder)
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree_lxml.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree_lxml.py
new file mode 100644
index 0000000..f6f395a
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/etree_lxml.py
@@ -0,0 +1,213 @@
1from __future__ import absolute_import, division, unicode_literals
2from pip._vendor.six import text_type
3
4from lxml import etree
5from ..treebuilders.etree import tag_regexp
6
7from . import base
8
9from .. import _ihatexml
10
11
12def ensure_str(s):
13 if s is None:
14 return None
15 elif isinstance(s, text_type):
16 return s
17 else:
18 return s.decode("ascii", "strict")
19
20
21class Root(object):
22 def __init__(self, et):
23 self.elementtree = et
24 self.children = []
25
26 try:
27 if et.docinfo.internalDTD:
28 self.children.append(Doctype(self,
29 ensure_str(et.docinfo.root_name),
30 ensure_str(et.docinfo.public_id),
31 ensure_str(et.docinfo.system_url)))
32 except AttributeError:
33 pass
34
35 try:
36 node = et.getroot()
37 except AttributeError:
38 node = et
39
40 while node.getprevious() is not None:
41 node = node.getprevious()
42 while node is not None:
43 self.children.append(node)
44 node = node.getnext()
45
46 self.text = None
47 self.tail = None
48
49 def __getitem__(self, key):
50 return self.children[key]
51
52 def getnext(self):
53 return None
54
55 def __len__(self):
56 return 1
57
58
59class Doctype(object):
60 def __init__(self, root_node, name, public_id, system_id):
61 self.root_node = root_node
62 self.name = name
63 self.public_id = public_id
64 self.system_id = system_id
65
66 self.text = None
67 self.tail = None
68
69 def getnext(self):
70 return self.root_node.children[1]
71
72
73class FragmentRoot(Root):
74 def __init__(self, children):
75 self.children = [FragmentWrapper(self, child) for child in children]
76 self.text = self.tail = None
77
78 def getnext(self):
79 return None
80
81
82class FragmentWrapper(object):
83 def __init__(self, fragment_root, obj):
84 self.root_node = fragment_root
85 self.obj = obj
86 if hasattr(self.obj, 'text'):
87 self.text = ensure_str(self.obj.text)
88 else:
89 self.text = None
90 if hasattr(self.obj, 'tail'):
91 self.tail = ensure_str(self.obj.tail)
92 else:
93 self.tail = None
94
95 def __getattr__(self, name):
96 return getattr(self.obj, name)
97
98 def getnext(self):
99 siblings = self.root_node.children
100 idx = siblings.index(self)
101 if idx < len(siblings) - 1:
102 return siblings[idx + 1]
103 else:
104 return None
105
106 def __getitem__(self, key):
107 return self.obj[key]
108
109 def __bool__(self):
110 return bool(self.obj)
111
112 def getparent(self):
113 return None
114
115 def __str__(self):
116 return str(self.obj)
117
118 def __unicode__(self):
119 return str(self.obj)
120
121 def __len__(self):
122 return len(self.obj)
123
124
125class TreeWalker(base.NonRecursiveTreeWalker):
126 def __init__(self, tree):
127 # pylint:disable=redefined-variable-type
128 if isinstance(tree, list):
129 self.fragmentChildren = set(tree)
130 tree = FragmentRoot(tree)
131 else:
132 self.fragmentChildren = set()
133 tree = Root(tree)
134 base.NonRecursiveTreeWalker.__init__(self, tree)
135 self.filter = _ihatexml.InfosetFilter()
136
137 def getNodeDetails(self, node):
138 if isinstance(node, tuple): # Text node
139 node, key = node
140 assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
141 return base.TEXT, ensure_str(getattr(node, key))
142
143 elif isinstance(node, Root):
144 return (base.DOCUMENT,)
145
146 elif isinstance(node, Doctype):
147 return base.DOCTYPE, node.name, node.public_id, node.system_id
148
149 elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
150 return base.TEXT, ensure_str(node.obj)
151
152 elif node.tag == etree.Comment:
153 return base.COMMENT, ensure_str(node.text)
154
155 elif node.tag == etree.Entity:
156 return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
157
158 else:
159 # This is assumed to be an ordinary element
160 match = tag_regexp.match(ensure_str(node.tag))
161 if match:
162 namespace, tag = match.groups()
163 else:
164 namespace = None
165 tag = ensure_str(node.tag)
166 attrs = {}
167 for name, value in list(node.attrib.items()):
168 name = ensure_str(name)
169 value = ensure_str(value)
170 match = tag_regexp.match(name)
171 if match:
172 attrs[(match.group(1), match.group(2))] = value
173 else:
174 attrs[(None, name)] = value
175 return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
176 attrs, len(node) > 0 or node.text)
177
178 def getFirstChild(self, node):
179 assert not isinstance(node, tuple), "Text nodes have no children"
180
181 assert len(node) or node.text, "Node has no children"
182 if node.text:
183 return (node, "text")
184 else:
185 return node[0]
186
187 def getNextSibling(self, node):
188 if isinstance(node, tuple): # Text node
189 node, key = node
190 assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
191 if key == "text":
192 # XXX: we cannot use a "bool(node) and node[0] or None" construct here
193 # because node[0] might evaluate to False if it has no child element
194 if len(node):
195 return node[0]
196 else:
197 return None
198 else: # tail
199 return node.getnext()
200
201 return (node, "tail") if node.tail else node.getnext()
202
203 def getParentNode(self, node):
204 if isinstance(node, tuple): # Text node
205 node, key = node
206 assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
207 if key == "text":
208 return node
209 # else: fallback to "normal" processing
210 elif node in self.fragmentChildren:
211 return None
212
213 return node.getparent()
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/genshi.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/genshi.py
new file mode 100644
index 0000000..42cd559
--- /dev/null
+++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/treewalkers/genshi.py
@@ -0,0 +1,69 @@
1from __future__ import absolute_import, division, unicode_literals
2
3from genshi.core import QName
4from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
5from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
6
7from . import base
8
9from ..constants import voidElements, namespaces
10
11
12class TreeWalker(base.TreeWalker):
13 def __iter__(self):
14 # Buffer the events so we can pass in the following one
15 previous = None
16 for event in self.tree:
17 if previous is not None:
18 for token in self.tokens(previous, event):
19 yield token
20 previous = event
21
22 # Don't forget the final event!
23 if previous is not None:
24 for token in self.tokens(previous, None):
25 yield token
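        # The one-event lookahead matters for void elements: ``tokens`` below
        # only reports children on an EmptyTag when the *next* event is not the
        # matching END, so each event has to be emitted together with its
        # successor.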
26
27 def tokens(self, event, next):
28 kind, data, _ = event
29 if kind == START:
30 tag, attribs = data
31 name = tag.localname
32 namespace = tag.namespace
33 converted_attribs = {}
34 for k, v in attribs:
35 if isinstance(k, QName):
36 converted_attribs[(k.namespace, k.localname)] = v
37 else:
38 converted_attribs[(None, k)] = v
39
40 if namespace == namespaces["html"] and name in voidElements:
41 for token in self.emptyTag(namespace, name, converted_attribs,
42 not next or next[0] != END or
43 next[1] != tag):
44 yield token
45 else:
46 yield self.startTag(namespace, name, converted_attribs)
47
48 elif kind == END:
49 name = data.localname
50 namespace = data.namespace
51 if namespace != namespaces["html"] or name not in voidElements:
52 yield self.endTag(namespace, name)
53
54 elif kind == COMMENT:
55 yield self.comment(data)
56
57 elif kind == TEXT:
58 for token in self.text(data):
59 yield token
60
61 elif kind == DOCTYPE:
62 yield self.doctype(*data)
63
64 elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
65 START_CDATA, END_CDATA, PI):
66 pass
67
68 else:
69 yield self.unknown(kind)