author     Shubham Saini <shubham6405@gmail.com>  2019-08-05 08:32:33 +0000
committer  Shubham Saini <shubham6405@gmail.com>  2019-08-05 08:32:33 +0000
commit     227b2d30a8675b44918f9d9ca89b24144a938215 (patch)
tree       9f8e6a28724514b6fdf463a9ab2067a7ef309b72 /venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py
parent     842a8cfbbbdb1f92889d892e4859dbd5d40c5be8 (diff)
removing venv files
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py')
-rw-r--r--  venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py  1292
1 file changed, 0 insertions, 1292 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py
deleted file mode 100644
index 9131b77..0000000
--- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/distlib/locators.py
+++ /dev/null
@@ -1,1292 +0,0 @@
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2012-2015 Vinay Sajip.
4# Licensed to the Python Software Foundation under a contributor agreement.
5# See LICENSE.txt and CONTRIBUTORS.txt.
6#
7
8import gzip
9from io import BytesIO
10import json
11import logging
12import os
13import posixpath
14import re
15try:
16 import threading
17except ImportError: # pragma: no cover
18 import dummy_threading as threading
19import zlib
20
21from . import DistlibException
22from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
23 queue, quote, unescape, string_types, build_opener,
24 HTTPRedirectHandler as BaseRedirectHandler, text_type,
25 Request, HTTPError, URLError)
26from .database import Distribution, DistributionPath, make_dist
27from .metadata import Metadata, MetadataInvalidError
28from .util import (cached_property, parse_credentials, ensure_slash,
29 split_filename, get_project_data, parse_requirement,
30 parse_name_and_version, ServerProxy, normalize_name)
31from .version import get_scheme, UnsupportedVersionError
32from .wheel import Wheel, is_compatible
33
34logger = logging.getLogger(__name__)
35
36HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
37CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
38HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
39DEFAULT_INDEX = 'https://pypi.python.org/pypi'
40
41def get_all_distribution_names(url=None):
42 """
43 Return all distribution names known by an index.
44 :param url: The URL of the index.
45 :return: A list of all known distribution names.
46 """
47 if url is None:
48 url = DEFAULT_INDEX
49 client = ServerProxy(url, timeout=3.0)
50 try:
51 return client.list_packages()
52 finally:
53 client('close')()
54
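# A minimal usage sketch for the helper above (hedged: it assumes the index
# at DEFAULT_INDEX is reachable and still serves the XML-RPC list_packages
# API; 'pip' is only an example package name):
#
#     >>> from distlib.locators import get_all_distribution_names
#     >>> names = get_all_distribution_names()
#     >>> 'pip' in names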
55class RedirectHandler(BaseRedirectHandler):
56 """
57 A class to work around a bug in some Python 3.2.x releases.
58 """
59 # There's a bug in the base version for some 3.2.x
60 # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
61 # returns e.g. /abc, it bails because it says the scheme ''
62 # is bogus, when actually it should use the request's
63 # URL for the scheme. See Python issue #13696.
64 def http_error_302(self, req, fp, code, msg, headers):
65 # Some servers (incorrectly) return multiple Location headers
66 # (so probably same goes for URI). Use first header.
67 newurl = None
68 for key in ('location', 'uri'):
69 if key in headers:
70 newurl = headers[key]
71 break
72 if newurl is None: # pragma: no cover
73 return
74 urlparts = urlparse(newurl)
75 if urlparts.scheme == '':
76 newurl = urljoin(req.get_full_url(), newurl)
77 if hasattr(headers, 'replace_header'):
78 headers.replace_header(key, newurl)
79 else:
80 headers[key] = newurl
81 return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
82 headers)
83
84 http_error_301 = http_error_303 = http_error_307 = http_error_302
85
86class Locator(object):
87 """
88 A base class for locators - things that locate distributions.
89 """
90 source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
91 binary_extensions = ('.egg', '.exe', '.whl')
92 excluded_extensions = ('.pdf',)
93
94 # A list of tags indicating which wheels you want to match. The default
95 # value of None matches against the tags compatible with the running
96 # Python. If you want to match other values, set wheel_tags on a locator
97 # instance to a list of tuples (pyver, abi, arch) which you want to match.
98 wheel_tags = None
99
100 downloadable_extensions = source_extensions + ('.whl',)
101
102 def __init__(self, scheme='default'):
103 """
104 Initialise an instance.
105 :param scheme: Because locators look for most recent versions, they
106 need to know the version scheme to use. This specifies
107 the current PEP-recommended scheme - use ``'legacy'``
108 if you need to support existing distributions on PyPI.
109 """
110 self._cache = {}
111 self.scheme = scheme
112 # Because of bugs in some of the handlers on some of the platforms,
113 # we use our own opener rather than just using urlopen.
114 self.opener = build_opener(RedirectHandler())
115 # If get_project() is called from locate(), the matcher instance
116 # is set from the requirement passed to locate(). See issue #18 for
117 # why this can be useful to know.
118 self.matcher = None
119 self.errors = queue.Queue()
120
121 def get_errors(self):
122 """
123 Return any errors which have occurred.
124 """
125 result = []
126 while not self.errors.empty(): # pragma: no cover
127 try:
128 e = self.errors.get(False)
129 result.append(e)
130 except queue.Empty:  # the exception lives on the queue module, not on Queue instances
131 continue
132 self.errors.task_done()
133 return result
134
135 def clear_errors(self):
136 """
137 Clear any errors which may have been logged.
138 """
139 # Just get the errors and throw them away
140 self.get_errors()
141
142 def clear_cache(self):
143 self._cache.clear()
144
145 def _get_scheme(self):
146 return self._scheme
147
148 def _set_scheme(self, value):
149 self._scheme = value
150
151 scheme = property(_get_scheme, _set_scheme)
152
153 def _get_project(self, name):
154 """
155 For a given project, get a dictionary mapping available versions to Distribution
156 instances.
157
158 This should be implemented in subclasses.
159
160 If called from a locate() request, self.matcher will be set to a
161 matcher for the requirement to satisfy, otherwise it will be None.
162 """
163 raise NotImplementedError('Please implement in the subclass')
164
165 def get_distribution_names(self):
166 """
167 Return all the distribution names known to this locator.
168 """
169 raise NotImplementedError('Please implement in the subclass')
170
171 def get_project(self, name):
172 """
173 For a given project, get a dictionary mapping available versions to Distribution
174 instances.
175
176 This calls _get_project to do all the work, and just implements a caching layer on top.
177 """
178 if self._cache is None: # pragma: no cover
179 result = self._get_project(name)
180 elif name in self._cache:
181 result = self._cache[name]
182 else:
183 self.clear_errors()
184 result = self._get_project(name)
185 self._cache[name] = result
186 return result
187
188 def score_url(self, url):
189 """
190 Give a URL a score which can be used to choose preferred URLs
191 for a given project release.
192 """
193 t = urlparse(url)
194 basename = posixpath.basename(t.path)
195 compatible = True
196 is_wheel = basename.endswith('.whl')
197 is_downloadable = basename.endswith(self.downloadable_extensions)
198 if is_wheel:
199 compatible = is_compatible(Wheel(basename), self.wheel_tags)
200 return (t.scheme == 'https', 'pypi.python.org' in t.netloc,
201 is_downloadable, is_wheel, compatible, basename)
202
203 def prefer_url(self, url1, url2):
204 """
205 Choose one of two URLs where both are candidates for distribution
206 archives for the same version of a distribution (for example,
207 .tar.gz vs. zip).
208
209 The current implementation favours https:// URLs over http://, archives
210 from PyPI over those from other locations, wheel compatibility (if a
211 wheel) and then the archive name.
212 """
213 result = url2
214 if url1:
215 s1 = self.score_url(url1)
216 s2 = self.score_url(url2)
217 if s1 > s2:
218 result = url1
219 if result != url2:
220 logger.debug('Not replacing %r with %r', url1, url2)
221 else:
222 logger.debug('Replacing %r with %r', url1, url2)
223 return result
224
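    # An illustrative sketch of the ordering above (the URLs are made up;
    # no wheels are involved, so compatibility doesn't affect the scores):
    #
    #     >>> loc = Locator()
    #     >>> u1 = 'https://pypi.python.org/packages/foo-1.0.tar.gz'
    #     >>> u2 = 'http://example.com/foo-1.0.zip'
    #     >>> loc.prefer_url(u1, u2) == u1   # https + PyPI outranks the rest
    #     True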
225 def split_filename(self, filename, project_name):
226 """
227 Attempt to split a filename into project name, version and Python version.
228 """
229 return split_filename(filename, project_name)
230
231 def convert_url_to_download_info(self, url, project_name):
232 """
233 See if a URL is a candidate for a download URL for a project (the URL
234 has typically been scraped from an HTML page).
235
236 If it is, a dictionary is returned with keys "name", "version",
237 "filename" and "url"; otherwise, None is returned.
238 """
239 def same_project(name1, name2):
240 return normalize_name(name1) == normalize_name(name2)
241
242 result = None
243 scheme, netloc, path, params, query, frag = urlparse(url)
244 if frag.lower().startswith('egg='): # pragma: no cover
245 logger.debug('%s: version hint in fragment: %r',
246 project_name, frag)
247 m = HASHER_HASH.match(frag)
248 if m:
249 algo, digest = m.groups()
250 else:
251 algo, digest = None, None
252 origpath = path
253 if path and path[-1] == '/': # pragma: no cover
254 path = path[:-1]
255 if path.endswith('.whl'):
256 try:
257 wheel = Wheel(path)
258 if is_compatible(wheel, self.wheel_tags):
259 if project_name is None:
260 include = True
261 else:
262 include = same_project(wheel.name, project_name)
263 if include:
264 result = {
265 'name': wheel.name,
266 'version': wheel.version,
267 'filename': wheel.filename,
268 'url': urlunparse((scheme, netloc, origpath,
269 params, query, '')),
270 'python-version': ', '.join(
271 ['.'.join(list(v[2:])) for v in wheel.pyver]),
272 }
273 except Exception as e: # pragma: no cover
274 logger.warning('invalid path for wheel: %s', path)
275 elif not path.endswith(self.downloadable_extensions): # pragma: no cover
276 logger.debug('Not downloadable: %s', path)
277 else: # downloadable extension
278 path = filename = posixpath.basename(path)
279 for ext in self.downloadable_extensions:
280 if path.endswith(ext):
281 path = path[:-len(ext)]
282 t = self.split_filename(path, project_name)
283 if not t: # pragma: no cover
284 logger.debug('No match for project/version: %s', path)
285 else:
286 name, version, pyver = t
287 if not project_name or same_project(project_name, name):
288 result = {
289 'name': name,
290 'version': version,
291 'filename': filename,
292 'url': urlunparse((scheme, netloc, origpath,
293 params, query, '')),
294 #'packagetype': 'sdist',
295 }
296 if pyver: # pragma: no cover
297 result['python-version'] = pyver
298 break
299 if result and algo:
300 result['%s_digest' % algo] = digest
301 return result
302
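    # A hedged sketch of the method above with a made-up sdist URL (no
    # network access is needed; the split is expected to come out as shown,
    # via split_filename):
    #
    #     >>> loc = Locator()
    #     >>> info = loc.convert_url_to_download_info(
    #     ...     'https://example.com/dists/foo-1.0.tar.gz', 'foo')
    #     >>> info['name'], info['version'], info['filename']
    #     ('foo', '1.0', 'foo-1.0.tar.gz')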
303 def _get_digest(self, info):
304 """
305 Get a digest from a dictionary by looking at keys of the form
306 'algo_digest'.
307
308 Returns a 2-tuple (algo, digest) if found, else None. Currently
309 looks only for SHA256, then MD5.
310 """
311 result = None
312 for algo in ('sha256', 'md5'):
313 key = '%s_digest' % algo
314 if key in info:
315 result = (algo, info[key])
316 break
317 return result
318
319 def _update_version_data(self, result, info):
320 """
321 Update a result dictionary (the final result from _get_project) with a
322 dictionary for a specific version, which typically holds information
323 gleaned from a filename or URL for an archive for the distribution.
324 """
325 name = info.pop('name')
326 version = info.pop('version')
327 if version in result:
328 dist = result[version]
329 md = dist.metadata
330 else:
331 dist = make_dist(name, version, scheme=self.scheme)
332 md = dist.metadata
333 dist.digest = digest = self._get_digest(info)
334 url = info['url']
335 result['digests'][url] = digest
336 if md.source_url != info['url']:
337 md.source_url = self.prefer_url(md.source_url, url)
338 result['urls'].setdefault(version, set()).add(url)
339 dist.locator = self
340 result[version] = dist
341
342 def locate(self, requirement, prereleases=False):
343 """
344 Find the most recent distribution which matches the given
345 requirement.
346
347 :param requirement: A requirement of the form 'foo (1.0)' or perhaps
348 'foo (>= 1.0, < 2.0, != 1.3)'
349 :param prereleases: If ``True``, allow pre-release versions
350 to be located. Otherwise, pre-release versions
351 are not returned.
352 :return: A :class:`Distribution` instance, or ``None`` if no such
353 distribution could be located.
354 """
355 result = None
356 r = parse_requirement(requirement)
357 if r is None: # pragma: no cover
358 raise DistlibException('Not a valid requirement: %r' % requirement)
359 scheme = get_scheme(self.scheme)
360 self.matcher = matcher = scheme.matcher(r.requirement)
361 logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
362 versions = self.get_project(r.name)
363 if len(versions) > 2: # urls and digests keys are present
364 # sometimes, versions are invalid
365 slist = []
366 vcls = matcher.version_class
367 for k in versions:
368 if k in ('urls', 'digests'):
369 continue
370 try:
371 if not matcher.match(k):
372 logger.debug('%s did not match %r', matcher, k)
373 else:
374 if prereleases or not vcls(k).is_prerelease:
375 slist.append(k)
376 else:
377 logger.debug('skipping pre-release '
378 'version %s of %s', k, matcher.name)
379 except Exception: # pragma: no cover
380 logger.warning('error matching %s with %r', matcher, k)
381 pass # slist.append(k)
382 if len(slist) > 1:
383 slist = sorted(slist, key=scheme.key)
384 if slist:
385 logger.debug('sorted list: %s', slist)
386 version = slist[-1]
387 result = versions[version]
388 if result:
389 if r.extras:
390 result.extras = r.extras
391 result.download_urls = versions.get('urls', {}).get(version, set())
392 d = {}
393 sd = versions.get('digests', {})
394 for url in result.download_urls:
395 if url in sd: # pragma: no cover
396 d[url] = sd[url]
397 result.digests = d
398 self.matcher = None
399 return result
400
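# A typical call to the method above through a concrete locator (hedged:
# assumes network access to the configured index; 'pip' is only an example
# project):
#
#     >>> from distlib.locators import default_locator
#     >>> dist = default_locator.locate('pip (>= 10.0)')
#     >>> dist is None or dist.name == 'pip'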
401
402class PyPIRPCLocator(Locator):
403 """
404 This locator uses XML-RPC to locate distributions. It therefore
405 cannot be used with simple mirrors (that only mirror file content).
406 """
407 def __init__(self, url, **kwargs):
408 """
409 Initialise an instance.
410
411 :param url: The URL to use for XML-RPC.
412 :param kwargs: Passed to the superclass constructor.
413 """
414 super(PyPIRPCLocator, self).__init__(**kwargs)
415 self.base_url = url
416 self.client = ServerProxy(url, timeout=3.0)
417
418 def get_distribution_names(self):
419 """
420 Return all the distribution names known to this locator.
421 """
422 return set(self.client.list_packages())
423
424 def _get_project(self, name):
425 result = {'urls': {}, 'digests': {}}
426 versions = self.client.package_releases(name, True)
427 for v in versions:
428 urls = self.client.release_urls(name, v)
429 data = self.client.release_data(name, v)
430 metadata = Metadata(scheme=self.scheme)
431 metadata.name = data['name']
432 metadata.version = data['version']
433 metadata.license = data.get('license')
434 metadata.keywords = data.get('keywords', [])
435 metadata.summary = data.get('summary')
436 dist = Distribution(metadata)
437 if urls:
438 info = urls[0]
439 metadata.source_url = info['url']
440 dist.digest = self._get_digest(info)
441 dist.locator = self
442 result[v] = dist
443 for info in urls:
444 url = info['url']
445 digest = self._get_digest(info)
446 result['urls'].setdefault(v, set()).add(url)
447 result['digests'][url] = digest
448 return result
449
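# Usage sketch for the XML-RPC locator above (hedged: assumes the endpoint
# still serves the XML-RPC API, which PyPI has been restricting over time):
#
#     >>> loc = PyPIRPCLocator('https://pypi.python.org/pypi')
#     >>> result = loc.get_project('pip')   # maps versions to Distributions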
450class PyPIJSONLocator(Locator):
451 """
452 This locator uses PyPI's JSON interface. It's very limited in functionality
453 and probably not worth using.
454 """
455 def __init__(self, url, **kwargs):
456 super(PyPIJSONLocator, self).__init__(**kwargs)
457 self.base_url = ensure_slash(url)
458
459 def get_distribution_names(self):
460 """
461 Return all the distribution names known to this locator.
462 """
463 raise NotImplementedError('Not available from this locator')
464
465 def _get_project(self, name):
466 result = {'urls': {}, 'digests': {}}
467 url = urljoin(self.base_url, '%s/json' % quote(name))
468 try:
469 resp = self.opener.open(url)
470 data = resp.read().decode() # for now
471 d = json.loads(data)
472 md = Metadata(scheme=self.scheme)
473 data = d['info']
474 md.name = data['name']
475 md.version = data['version']
476 md.license = data.get('license')
477 md.keywords = data.get('keywords', [])
478 md.summary = data.get('summary')
479 dist = Distribution(md)
480 dist.locator = self
481 urls = d['urls']
482 result[md.version] = dist
483 for info in d['urls']:
484 url = info['url']
485 dist.download_urls.add(url)
486 dist.digests[url] = self._get_digest(info)
487 result['urls'].setdefault(md.version, set()).add(url)
488 result['digests'][url] = self._get_digest(info)
489 # Now get other releases
490 for version, infos in d['releases'].items():
491 if version == md.version:
492 continue # already done
493 omd = Metadata(scheme=self.scheme)
494 omd.name = md.name
495 omd.version = version
496 odist = Distribution(omd)
497 odist.locator = self
498 result[version] = odist
499 for info in infos:
500 url = info['url']
501 odist.download_urls.add(url)
502 odist.digests[url] = self._get_digest(info)
503 result['urls'].setdefault(version, set()).add(url)
504 result['digests'][url] = self._get_digest(info)
505# for info in urls:
506# md.source_url = info['url']
507# dist.digest = self._get_digest(info)
508# dist.locator = self
509# for info in urls:
510# url = info['url']
511# result['urls'].setdefault(md.version, set()).add(url)
512# result['digests'][url] = self._get_digest(info)
513 except Exception as e:
514 self.errors.put(text_type(e))
515 logger.exception('JSON fetch failed: %s', e)
516 return result
517
518
519class Page(object):
520 """
521 This class represents a scraped HTML page.
522 """
523 # The following slightly hairy-looking regex just looks for the contents of
524 # an anchor link, whose "href" attribute may be immediately preceded or
525 # followed by an optional "rel" attribute. The attribute values can be
526 # declared with double quotes, single quotes or no quotes - which leads to
527 # the length of the expression.
528 _href = re.compile("""
529(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
530href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
531(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
532""", re.I | re.S | re.X)
533 _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)
534
535 def __init__(self, data, url):
536 """
537 Initialise an instance with the Unicode page contents and the URL they
538 came from.
539 """
540 self.data = data
541 self.base_url = self.url = url
542 m = self._base.search(self.data)
543 if m:
544 self.base_url = m.group(1)
545
546 _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
547
548 @cached_property
549 def links(self):
550 """
551 Return the URLs of all the links on a page together with information
552 about their "rel" attribute, for determining which ones to treat as
553 downloads and which ones to queue for further scraping.
554 """
555 def clean(url):
556 "Tidy up a URL."
557 scheme, netloc, path, params, query, frag = urlparse(url)
558 return urlunparse((scheme, netloc, quote(path),
559 params, query, frag))
560
561 result = set()
562 for match in self._href.finditer(self.data):
563 d = match.groupdict('')
564 rel = (d['rel1'] or d['rel2'] or d['rel3'] or
565 d['rel4'] or d['rel5'] or d['rel6'])
566 url = d['url1'] or d['url2'] or d['url3']
567 url = urljoin(self.base_url, url)
568 url = unescape(url)
569 url = self._clean_re.sub(lambda m: '%%%02x' % ord(m.group(0)), url)  # zero-pad for a valid %-escape
570 result.add((url, rel))
571 # We sort the result, hoping to bring the most recent versions
572 # to the front
573 result = sorted(result, key=lambda t: t[0], reverse=True)
574 return result
575
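# Offline sketch of Page link extraction (the HTML snippet and host are
# made up for illustration):
#
#     >>> html = '<a href="/packages/foo-1.0.tar.gz" rel="download">foo</a>'
#     >>> page = Page(html, 'https://example.com/simple/foo/')
#     >>> page.links
#     [('https://example.com/packages/foo-1.0.tar.gz', 'download')]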
576
577class SimpleScrapingLocator(Locator):
578 """
579 A locator which scrapes HTML pages to locate downloads for a distribution.
580 This runs multiple threads to do the I/O; performance is at least as good
581 as pip's PackageFinder, which works in an analogous fashion.
582 """
583
584 # These are used to deal with various Content-Encoding schemes.
585 decoders = {
586 'deflate': zlib.decompress,
587 'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
588 'none': lambda b: b,
589 }
590
591 def __init__(self, url, timeout=None, num_workers=10, **kwargs):
592 """
593 Initialise an instance.
594 :param url: The root URL to use for scraping.
595 :param timeout: The timeout, in seconds, to be applied to requests.
596 This defaults to ``None`` (no timeout specified).
597 :param num_workers: The number of worker threads you want to do I/O.
598 This defaults to 10.
599 :param kwargs: Passed to the superclass.
600 """
601 super(SimpleScrapingLocator, self).__init__(**kwargs)
602 self.base_url = ensure_slash(url)
603 self.timeout = timeout
604 self._page_cache = {}
605 self._seen = set()
606 self._to_fetch = queue.Queue()
607 self._bad_hosts = set()
608 self.skip_externals = False
609 self.num_workers = num_workers
610 self._lock = threading.RLock()
611 # See issue #45: we need to be resilient when the locator is used
612 # in a thread, e.g. with concurrent.futures. We can't use self._lock
613 # as it is for coordinating our internal threads - the ones created
614 # in _prepare_threads.
615 self._gplock = threading.RLock()
616
617 def _prepare_threads(self):
618 """
619 Threads are created only when get_project is called, and terminate
620 before it returns. They are there primarily to parallelise I/O (i.e.
621 fetching web pages).
622 """
623 self._threads = []
624 for i in range(self.num_workers):
625 t = threading.Thread(target=self._fetch)
626 t.daemon = True
627 t.start()
628 self._threads.append(t)
629
630 def _wait_threads(self):
631 """
632 Tell all the threads to terminate (by sending a sentinel value) and
633 wait for them to do so.
634 """
635 # Note that you need two loops, since you can't say which
636 # thread will get each sentinel
637 for t in self._threads:
638 self._to_fetch.put(None) # sentinel
639 for t in self._threads:
640 t.join()
641 self._threads = []
642
643 def _get_project(self, name):
644 result = {'urls': {}, 'digests': {}}
645 with self._gplock:
646 self.result = result
647 self.project_name = name
648 url = urljoin(self.base_url, '%s/' % quote(name))
649 self._seen.clear()
650 self._page_cache.clear()
651 self._prepare_threads()
652 try:
653 logger.debug('Queueing %s', url)
654 self._to_fetch.put(url)
655 self._to_fetch.join()
656 finally:
657 self._wait_threads()
658 del self.result
659 return result
660
661 platform_dependent = re.compile(r'\b(linux-(i\d86|x86_64|arm\w+)|'
662 r'win(32|-amd64)|macosx-?\d+)\b', re.I)
663
664 def _is_platform_dependent(self, url):
665 """
666 Does a URL refer to a platform-specific download?
667 """
668 return self.platform_dependent.search(url)
669
670 def _process_download(self, url):
671 """
672 See if a URL is a suitable download for a project.
673
674 If it is, register information in the result dictionary (for
675 _get_project) about the specific version it's for.
676
677 Note that the return value isn't actually used other than as a boolean
678 value.
679 """
680 if self._is_platform_dependent(url):
681 info = None
682 else:
683 info = self.convert_url_to_download_info(url, self.project_name)
684 logger.debug('process_download: %s -> %s', url, info)
685 if info:
686 with self._lock: # needed because self.result is shared
687 self._update_version_data(self.result, info)
688 return info
689
690 def _should_queue(self, link, referrer, rel):
691 """
692 Determine whether a link URL from a referring page and with a
693 particular "rel" attribute should be queued for scraping.
694 """
695 scheme, netloc, path, _, _, _ = urlparse(link)
696 if path.endswith(self.source_extensions + self.binary_extensions +
697 self.excluded_extensions):
698 result = False
699 elif self.skip_externals and not link.startswith(self.base_url):
700 result = False
701 elif not referrer.startswith(self.base_url):
702 result = False
703 elif rel not in ('homepage', 'download'):
704 result = False
705 elif scheme not in ('http', 'https', 'ftp'):
706 result = False
707 elif self._is_platform_dependent(link):
708 result = False
709 else:
710 host = netloc.split(':', 1)[0]
711 if host.lower() == 'localhost':
712 result = False
713 else:
714 result = True
715 logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
716 referrer, result)
717 return result
718
719 def _fetch(self):
720 """
721 Get a URL to fetch from the work queue, get the HTML page, examine its
722 links for download candidates and candidates for further scraping.
723
724 This is a handy method to run in a thread.
725 """
726 while True:
727 url = self._to_fetch.get()
728 try:
729 if url:
730 page = self.get_page(url)
731 if page is None: # e.g. after an error
732 continue
733 for link, rel in page.links:
734 if link not in self._seen:
735 try:
736 self._seen.add(link)
737 if (not self._process_download(link) and
738 self._should_queue(link, url, rel)):
739 logger.debug('Queueing %s from %s', link, url)
740 self._to_fetch.put(link)
741 except MetadataInvalidError: # e.g. invalid versions
742 pass
743 except Exception as e: # pragma: no cover
744 self.errors.put(text_type(e))
745 finally:
746 # always do this, to avoid hangs :-)
747 self._to_fetch.task_done()
748 if not url:
749 #logger.debug('Sentinel seen, quitting.')
750 break
751
752 def get_page(self, url):
753 """
754 Get the HTML for a URL, possibly from an in-memory cache.
755
756 XXX TODO Note: this cache is never actually cleared. It's assumed that
757 the data won't get stale over the lifetime of a locator instance (not
758 necessarily true for the default_locator).
759 """
760 # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
761 scheme, netloc, path, _, _, _ = urlparse(url)
762 if scheme == 'file' and os.path.isdir(url2pathname(path)):
763 url = urljoin(ensure_slash(url), 'index.html')
764
765 if url in self._page_cache:
766 result = self._page_cache[url]
767 logger.debug('Returning %s from cache: %s', url, result)
768 else:
769 host = netloc.split(':', 1)[0]
770 result = None
771 if host in self._bad_hosts:
772 logger.debug('Skipping %s due to bad host %s', url, host)
773 else:
774 req = Request(url, headers={'Accept-encoding': 'identity'})
775 try:
776 logger.debug('Fetching %s', url)
777 resp = self.opener.open(req, timeout=self.timeout)
778 logger.debug('Fetched %s', url)
779 headers = resp.info()
780 content_type = headers.get('Content-Type', '')
781 if HTML_CONTENT_TYPE.match(content_type):
782 final_url = resp.geturl()
783 data = resp.read()
784 encoding = headers.get('Content-Encoding')
785 if encoding:
786 decoder = self.decoders[encoding] # fail if not found
787 data = decoder(data)
788 encoding = 'utf-8'
789 m = CHARSET.search(content_type)
790 if m:
791 encoding = m.group(1)
792 try:
793 data = data.decode(encoding)
794 except UnicodeError: # pragma: no cover
795 data = data.decode('latin-1') # fallback
796 result = Page(data, final_url)
797 self._page_cache[final_url] = result
798 except HTTPError as e:
799 if e.code != 404:
800 logger.exception('Fetch failed: %s: %s', url, e)
801 except URLError as e: # pragma: no cover
802 logger.exception('Fetch failed: %s: %s', url, e)
803 with self._lock:
804 self._bad_hosts.add(host)
805 except Exception as e: # pragma: no cover
806 logger.exception('Fetch failed: %s: %s', url, e)
807 finally:
808 self._page_cache[url] = result # even if None (failure)
809 return result
810
811 _distname_re = re.compile('<a href=[^>]*>([^<]+)<')
812
813 def get_distribution_names(self):
814 """
815 Return all the distribution names known to this locator.
816 """
817 result = set()
818 page = self.get_page(self.base_url)
819 if not page:
820 raise DistlibException('Unable to get %s' % self.base_url)
821 for match in self._distname_re.finditer(page.data):
822 result.add(match.group(1))
823 return result
824
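# Usage sketch for the scraping locator above (hedged: needs network access;
# the index URL mirrors the default_locator configuration further below):
#
#     >>> loc = SimpleScrapingLocator('https://pypi.python.org/simple/',
#     ...                             timeout=3.0)
#     >>> result = loc.get_project('pip')   # includes 'urls' and 'digests' keys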
825class DirectoryLocator(Locator):
826 """
827 This class locates distributions in a directory tree.
828 """
829
830 def __init__(self, path, **kwargs):
831 """
832 Initialise an instance.
833 :param path: The root of the directory tree to search.
834 :param kwargs: Passed to the superclass constructor,
835 except for:
836 * recursive - if True (the default), subdirectories are
837 recursed into. If False, only the top-level directory
838 is searched.
839 """
840 self.recursive = kwargs.pop('recursive', True)
841 super(DirectoryLocator, self).__init__(**kwargs)
842 path = os.path.abspath(path)
843 if not os.path.isdir(path): # pragma: no cover
844 raise DistlibException('Not a directory: %r' % path)
845 self.base_dir = path
846
847 def should_include(self, filename, parent):
848 """
849 Should a filename be considered as a candidate for a distribution
850 archive? As well as the filename, the directory which contains it
851 is provided, though not used by the current implementation.
852 """
853 return filename.endswith(self.downloadable_extensions)
854
855 def _get_project(self, name):
856 result = {'urls': {}, 'digests': {}}
857 for root, dirs, files in os.walk(self.base_dir):
858 for fn in files:
859 if self.should_include(fn, root):
860 fn = os.path.join(root, fn)
861 url = urlunparse(('file', '',
862 pathname2url(os.path.abspath(fn)),
863 '', '', ''))
864 info = self.convert_url_to_download_info(url, name)
865 if info:
866 self._update_version_data(result, info)
867 if not self.recursive:
868 break
869 return result
870
871 def get_distribution_names(self):
872 """
873 Return all the distribution names known to this locator.
874 """
875 result = set()
876 for root, dirs, files in os.walk(self.base_dir):
877 for fn in files:
878 if self.should_include(fn, root):
879 fn = os.path.join(root, fn)
880 url = urlunparse(('file', '',
881 pathname2url(os.path.abspath(fn)),
882 '', '', ''))
883 info = self.convert_url_to_download_info(url, None)
884 if info:
885 result.add(info['name'])
886 if not self.recursive:
887 break
888 return result
889
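# Usage sketch for the directory locator above ('/srv/wheelhouse' is a
# hypothetical path holding sdists and/or wheels):
#
#     >>> loc = DirectoryLocator('/srv/wheelhouse', recursive=False)
#     >>> dist = loc.locate('foo (>= 1.0)')   # None if nothing matches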
890class JSONLocator(Locator):
891 """
892 This locator uses special extended metadata (not available on PyPI) and is
893 the basis of performant dependency resolution in distlib. Other locators
894 require archive downloads before dependencies can be determined! As you
895 might imagine, that can be slow.
896 """
897 def get_distribution_names(self):
898 """
899 Return all the distribution names known to this locator.
900 """
901 raise NotImplementedError('Not available from this locator')
902
903 def _get_project(self, name):
904 result = {'urls': {}, 'digests': {}}
905 data = get_project_data(name)
906 if data:
907 for info in data.get('files', []):
908 if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
909 continue
910 # We don't store summary in project metadata as it makes
911 # the data bigger for no benefit during dependency
912 # resolution
913 dist = make_dist(data['name'], info['version'],
914 summary=data.get('summary',
915 'Placeholder for summary'),
916 scheme=self.scheme)
917 md = dist.metadata
918 md.source_url = info['url']
919 # TODO SHA256 digest
920 if 'digest' in info and info['digest']:
921 dist.digest = ('md5', info['digest'])
922 md.dependencies = info.get('requirements', {})
923 dist.exports = info.get('exports', {})
924 result[dist.version] = dist
925 result['urls'].setdefault(dist.version, set()).add(info['url'])
926 return result
927
928class DistPathLocator(Locator):
929 """
930 This locator finds installed distributions in a path. It can be useful for
931 adding to an :class:`AggregatingLocator`.
932 """
933 def __init__(self, distpath, **kwargs):
934 """
935 Initialise an instance.
936
937 :param distpath: A :class:`DistributionPath` instance to search.
938 """
939 super(DistPathLocator, self).__init__(**kwargs)
940 assert isinstance(distpath, DistributionPath)
941 self.distpath = distpath
942
943 def _get_project(self, name):
944 dist = self.distpath.get_distribution(name)
945 if dist is None:
946 result = {'urls': {}, 'digests': {}}
947 else:
948 result = {
949 dist.version: dist,
950 'urls': {dist.version: set([dist.source_url])},
951 'digests': {dist.version: set([None])}
952 }
953 return result
954
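# Usage sketch for the locator above (a default DistributionPath searches
# the running interpreter's installed distributions; 'setuptools' is just
# an assumed installed name):
#
#     >>> from distlib.database import DistributionPath
#     >>> loc = DistPathLocator(DistributionPath())
#     >>> result = loc.get_project('setuptools')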
955
956class AggregatingLocator(Locator):
957 """
958 This class allows you to chain and/or merge a list of locators.
959 """
960 def __init__(self, *locators, **kwargs):
961 """
962 Initialise an instance.
963
964 :param locators: The list of locators to search.
965 :param kwargs: Passed to the superclass constructor,
966 except for:
967 * merge - if False (the default), the first successful
968 search from any of the locators is returned. If True,
969 the results from all locators are merged (this can be
970 slow).
971 """
972 self.merge = kwargs.pop('merge', False)
973 self.locators = locators
974 super(AggregatingLocator, self).__init__(**kwargs)
975
976 def clear_cache(self):
977 super(AggregatingLocator, self).clear_cache()
978 for locator in self.locators:
979 locator.clear_cache()
980
981 def _set_scheme(self, value):
982 self._scheme = value
983 for locator in self.locators:
984 locator.scheme = value
985
986 scheme = property(Locator.scheme.fget, _set_scheme)
987
988 def _get_project(self, name):
989 result = {}
990 for locator in self.locators:
991 d = locator.get_project(name)
992 if d:
993 if self.merge:
994 files = result.get('urls', {})
995 digests = result.get('digests', {})
996 # next line could overwrite result['urls'], result['digests']
997 result.update(d)
998 df = result.get('urls')
999 if files and df:
1000 for k, v in files.items():
1001 if k in df:
1002 df[k] |= v
1003 else:
1004 df[k] = v
1005 dd = result.get('digests')
1006 if digests and dd:
1007 dd.update(digests)
1008 else:
1009 # See issue #18. If any dists are found and we're looking
1010 # for specific constraints, we only return something if
1011 # a match is found. For example, if a DirectoryLocator
1012 # returns just foo (1.0) while we're looking for
1013 # foo (>= 2.0), we'll pretend there was nothing there so
1014 # that subsequent locators can be queried. Otherwise we
1015 # would just return foo (1.0) which would then lead to a
1016 # failure to find foo (>= 2.0), because other locators
1017 # weren't searched. Note that this only matters when
1018 # merge=False.
1019 if self.matcher is None:
1020 found = True
1021 else:
1022 found = False
1023 for k in d:
1024 if self.matcher.match(k):
1025 found = True
1026 break
1027 if found:
1028 result = d
1029 break
1030 return result
1031
1032 def get_distribution_names(self):
1033 """
1034 Return all the distribution names known to this locator.
1035 """
1036 result = set()
1037 for locator in self.locators:
1038 try:
1039 result |= locator.get_distribution_names()
1040 except NotImplementedError:
1041 pass
1042 return result
1043
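# Composition sketch for the aggregating locator above (the directory path
# is hypothetical; with merge=False the first successful lookup wins, as
# documented):
#
#     >>> loc = AggregatingLocator(
#     ...     DirectoryLocator('/srv/wheelhouse'),
#     ...     SimpleScrapingLocator('https://pypi.python.org/simple/',
#     ...                           timeout=3.0),
#     ...     scheme='legacy')
#     >>> dist = loc.locate('foo (>= 1.0)')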
1044
1045# We use a legacy scheme simply because most of the dists on PyPI use legacy
1046# versions which don't conform to PEP 426 / PEP 440.
1047default_locator = AggregatingLocator(
1048 JSONLocator(),
1049 SimpleScrapingLocator('https://pypi.python.org/simple/',
1050 timeout=3.0),
1051 scheme='legacy')
1052
1053locate = default_locator.locate
1054
1055NAME_VERSION_RE = re.compile(r'(?P<name>[\w-]+)\s*'
1056 r'\(\s*(==\s*)?(?P<ver>[^)]+)\)$')
1057
1058class DependencyFinder(object):
1059 """
1060 Locate dependencies for distributions.
1061 """
1062
1063 def __init__(self, locator=None):
1064 """
1065 Initialise an instance, using the specified locator
1066 to locate distributions.
1067 """
1068 self.locator = locator or default_locator
1069 self.scheme = get_scheme(self.locator.scheme)
1070
1071 def add_distribution(self, dist):
1072 """
1073 Add a distribution to the finder. This will update internal information
1074 about who provides what.
1075 :param dist: The distribution to add.
1076 """
1077 logger.debug('adding distribution %s', dist)
1078 name = dist.key
1079 self.dists_by_name[name] = dist
1080 self.dists[(name, dist.version)] = dist
1081 for p in dist.provides:
1082 name, version = parse_name_and_version(p)
1083 logger.debug('Add to provided: %s, %s, %s', name, version, dist)
1084 self.provided.setdefault(name, set()).add((version, dist))
1085
1086 def remove_distribution(self, dist):
1087 """
1088 Remove a distribution from the finder. This will update internal
1089 information about who provides what.
1090 :param dist: The distribution to remove.
1091 """
1092 logger.debug('removing distribution %s', dist)
1093 name = dist.key
1094 del self.dists_by_name[name]
1095 del self.dists[(name, dist.version)]
1096 for p in dist.provides:
1097 name, version = parse_name_and_version(p)
1098 logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
1099 s = self.provided[name]
1100 s.remove((version, dist))
1101 if not s:
1102 del self.provided[name]
1103
1104 def get_matcher(self, reqt):
1105 """
1106 Get a version matcher for a requirement.
1107 :param reqt: The requirement
1108 :type reqt: str
1109 :return: A version matcher (an instance of
1110 :class:`distlib.version.Matcher`).
1111 """
1112 try:
1113 matcher = self.scheme.matcher(reqt)
1114 except UnsupportedVersionError: # pragma: no cover
1115 # XXX compat-mode if cannot read the version
1116 name = reqt.split()[0]
1117 matcher = self.scheme.matcher(name)
1118 return matcher
1119
1120 def find_providers(self, reqt):
1121 """
1122 Find the distributions which can fulfill a requirement.
1123
1124 :param reqt: The requirement.
1125 :type reqt: str
1126 :return: A set of distributions which can fulfill the requirement.
1127 """
1128 matcher = self.get_matcher(reqt)
1129 name = matcher.key # case-insensitive
1130 result = set()
1131 provided = self.provided
1132 if name in provided:
1133 for version, provider in provided[name]:
1134 try:
1135 match = matcher.match(version)
1136 except UnsupportedVersionError:
1137 match = False
1138
1139 if match:
1140 result.add(provider)
1141 break
1142 return result
1143
1144 def try_to_replace(self, provider, other, problems):
1145 """
1146 Attempt to replace one provider with another. This is typically used
1147 when resolving dependencies from multiple sources, e.g. A requires
1148 (B >= 1.0) while C requires (B >= 1.1).
1149
1150 For successful replacement, ``provider`` must meet all the requirements
1151 which ``other`` fulfills.
1152
1153 :param provider: The provider we are trying to replace with.
1154 :param other: The provider we're trying to replace.
1155 :param problems: If False is returned, this will contain what
1156 problems prevented replacement. This is currently
1157 a tuple of the literal string 'cantreplace',
1158 ``provider``, ``other`` and the set of requirements
1159 that ``provider`` couldn't fulfill.
1160 :return: True if we can replace ``other`` with ``provider``, else
1161 False.
1162 """
1163 rlist = self.reqts[other]
1164 unmatched = set()
1165 for s in rlist:
1166 matcher = self.get_matcher(s)
1167 if not matcher.match(provider.version):
1168 unmatched.add(s)
1169 if unmatched:
1170 # can't replace other with provider
1171 problems.add(('cantreplace', provider, other,
1172 frozenset(unmatched)))
1173 result = False
1174 else:
1175 # can replace other with provider
1176 self.remove_distribution(other)
1177 del self.reqts[other]
1178 for s in rlist:
1179 self.reqts.setdefault(provider, set()).add(s)
1180 self.add_distribution(provider)
1181 result = True
1182 return result
1183
1184 def find(self, requirement, meta_extras=None, prereleases=False):
1185 """
1186 Find a distribution and all distributions it depends on.
1187
1188 :param requirement: The requirement specifying the distribution to
1189 find, or a Distribution instance.
1190 :param meta_extras: A list of meta extras such as :test:, :build: and
1191 so on.
1192 :param prereleases: If ``True``, allow pre-release versions to be
1193 returned - otherwise, don't return prereleases
1194 unless they're all that's available.
1195
1196 Return a set of :class:`Distribution` instances and a set of
1197 problems.
1198
1199 The distributions returned should be such that they have the
1200 :attr:`requested` attribute set to ``True`` if they were
1201 from the ``requirement`` passed to ``find()``, and they have the
1202 :attr:`build_time_dependency` attribute set to ``True`` unless they
1203 are post-installation dependencies of the ``requirement``.
1204
1205 Each problem is a tuple consisting of the string
1206 ``'unsatisfied'`` and a requirement which couldn't be satisfied
1207 by any distribution known to the locator.
1208 """
1209
1210 self.provided = {}
1211 self.dists = {}
1212 self.dists_by_name = {}
1213 self.reqts = {}
1214
1215 meta_extras = set(meta_extras or [])
1216 if ':*:' in meta_extras:
1217 meta_extras.remove(':*:')
1218 # :meta: and :run: are implicitly included
1219 meta_extras |= set([':test:', ':build:', ':dev:'])
1220
1221 if isinstance(requirement, Distribution):
1222 dist = odist = requirement
1223 logger.debug('passed %s as requirement', odist)
1224 else:
1225 dist = odist = self.locator.locate(requirement,
1226 prereleases=prereleases)
1227 if dist is None:
1228 raise DistlibException('Unable to locate %r' % requirement)
1229 logger.debug('located %s', odist)
1230 dist.requested = True
1231 problems = set()
1232 todo = set([dist])
1233 install_dists = set([odist])
1234 while todo:
1235 dist = todo.pop()
1236 name = dist.key # case-insensitive
1237 if name not in self.dists_by_name:
1238 self.add_distribution(dist)
1239 else:
1240 #import pdb; pdb.set_trace()
1241 other = self.dists_by_name[name]
1242 if other != dist:
1243 self.try_to_replace(dist, other, problems)
1244
1245 ireqts = dist.run_requires | dist.meta_requires
1246 sreqts = dist.build_requires
1247 ereqts = set()
1248 if meta_extras and dist in install_dists:
1249 for key in ('test', 'build', 'dev'):
1250 e = ':%s:' % key
1251 if e in meta_extras:
1252 ereqts |= getattr(dist, '%s_requires' % key)
1253 all_reqts = ireqts | sreqts | ereqts
1254 for r in all_reqts:
1255 providers = self.find_providers(r)
1256 if not providers:
1257 logger.debug('No providers found for %r', r)
1258 provider = self.locator.locate(r, prereleases=prereleases)
1259 # If no provider is found and we didn't consider
1260 # prereleases, consider them now.
1261 if provider is None and not prereleases:
1262 provider = self.locator.locate(r, prereleases=True)
1263 if provider is None:
1264 logger.debug('Cannot satisfy %r', r)
1265 problems.add(('unsatisfied', r))
1266 else:
1267 n, v = provider.key, provider.version
1268 if (n, v) not in self.dists:
1269 todo.add(provider)
1270 providers.add(provider)
1271 if r in ireqts and dist in install_dists:
1272 install_dists.add(provider)
1273 logger.debug('Adding %s to install_dists',
1274 provider.name_and_version)
1275 for p in providers:
1276 name = p.key
1277 if name not in self.dists_by_name:
1278 self.reqts.setdefault(p, set()).add(r)
1279 else:
1280 other = self.dists_by_name[name]
1281 if other != p:
1282 # see if other can be replaced by p
1283 self.try_to_replace(p, other, problems)
1284
1285 dists = set(self.dists.values())
1286 for dist in dists:
1287 dist.build_time_dependency = dist not in install_dists
1288 if dist.build_time_dependency:
1289 logger.debug('%s is a build-time dependency only.',
1290 dist.name_and_version)
1291 logger.debug('find done for %s', odist)
1292 return dists, problems
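# End-to-end sketch for DependencyFinder (hedged: resolves over the network
# via default_locator; 'pip (== 10.0.1)' is only an example requirement):
#
#     >>> finder = DependencyFinder()
#     >>> dists, problems = finder.find('pip (== 10.0.1)')
#     >>> [d.name_and_version for d in dists if not d.build_time_dependency]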