From 4f9a84d624e9a7a6a0d1902c065219cda7560cfb Mon Sep 17 00:00:00 2001 From: Roy <88516395+moraroy@users.noreply.github.com> Date: Wed, 3 Jan 2024 20:19:09 -0800 Subject: [PATCH] Add files via upload --- Modules/urllib3/__init__.py | 148 ++++ Modules/urllib3/_base_connection.py | 172 ++++ Modules/urllib3/_collections.py | 483 +++++++++++ Modules/urllib3/_request_methods.py | 217 +++++ Modules/urllib3/_version.py | 4 + Modules/urllib3/connection.py | 905 ++++++++++++++++++++ Modules/urllib3/connectionpool.py | 1182 +++++++++++++++++++++++++++ Modules/urllib3/exceptions.py | 318 +++++++ Modules/urllib3/fields.py | 345 ++++++++ Modules/urllib3/filepost.py | 89 ++ Modules/urllib3/poolmanager.py | 638 +++++++++++++++ Modules/urllib3/py.typed | 2 + Modules/urllib3/response.py | 1130 +++++++++++++++++++++++++ 13 files changed, 5633 insertions(+) create mode 100644 Modules/urllib3/_base_connection.py create mode 100644 Modules/urllib3/_collections.py create mode 100644 Modules/urllib3/_request_methods.py create mode 100644 Modules/urllib3/_version.py create mode 100644 Modules/urllib3/connection.py create mode 100644 Modules/urllib3/connectionpool.py create mode 100644 Modules/urllib3/exceptions.py create mode 100644 Modules/urllib3/fields.py create mode 100644 Modules/urllib3/filepost.py create mode 100644 Modules/urllib3/poolmanager.py create mode 100644 Modules/urllib3/py.typed create mode 100644 Modules/urllib3/response.py diff --git a/Modules/urllib3/__init__.py b/Modules/urllib3/__init__.py index 8b13789..46c8976 100644 --- a/Modules/urllib3/__init__.py +++ b/Modules/urllib3/__init__.py @@ -1 +1,149 @@ +""" +Python HTTP library with thread-safe connection pooling, file post support, user friendly, and more +""" +from __future__ import annotations + +# Set default logging handler to avoid "No handler found" warnings. +import logging +import typing +import warnings +from logging import NullHandler + +from . 
import exceptions +from ._base_connection import _TYPE_BODY +from ._collections import HTTPHeaderDict +from ._version import __version__ +from .connectionpool import HTTPConnectionPool, HTTPSConnectionPool, connection_from_url +from .filepost import _TYPE_FIELDS, encode_multipart_formdata +from .poolmanager import PoolManager, ProxyManager, proxy_from_url +from .response import BaseHTTPResponse, HTTPResponse +from .util.request import make_headers +from .util.retry import Retry +from .util.timeout import Timeout + +# Ensure that Python is compiled with OpenSSL 1.1.1+ +# If the 'ssl' module isn't available at all that's +# fine, we only care if the module is available. +try: + import ssl +except ImportError: + pass +else: + if not ssl.OPENSSL_VERSION.startswith("OpenSSL "): # Defensive: + warnings.warn( + "urllib3 v2 only supports OpenSSL 1.1.1+, currently " + f"the 'ssl' module is compiled with {ssl.OPENSSL_VERSION!r}. " + "See: https://github.com/urllib3/urllib3/issues/3020", + exceptions.NotOpenSSLWarning, + ) + elif ssl.OPENSSL_VERSION_INFO < (1, 1, 1): # Defensive: + raise ImportError( + "urllib3 v2 only supports OpenSSL 1.1.1+, currently " + f"the 'ssl' module is compiled with {ssl.OPENSSL_VERSION!r}. " + "See: https://github.com/urllib3/urllib3/issues/2168" + ) + +__author__ = "Andrey Petrov (andrey.petrov@shazow.net)" +__license__ = "MIT" +__version__ = __version__ + +__all__ = ( + "HTTPConnectionPool", + "HTTPHeaderDict", + "HTTPSConnectionPool", + "PoolManager", + "ProxyManager", + "HTTPResponse", + "Retry", + "Timeout", + "add_stderr_logger", + "connection_from_url", + "disable_warnings", + "encode_multipart_formdata", + "make_headers", + "proxy_from_url", + "request", + "BaseHTTPResponse", +) + +logging.getLogger(__name__).addHandler(NullHandler()) + + +def add_stderr_logger( + level: int = logging.DEBUG, +) -> logging.StreamHandler[typing.TextIO]: + """ + Helper for quickly adding a StreamHandler to the logger. Useful for + debugging. 
+ + Returns the handler after adding it. + """ + # This method needs to be in this __init__.py to get the __name__ correct + # even if urllib3 is vendored within another package. + logger = logging.getLogger(__name__) + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s")) + logger.addHandler(handler) + logger.setLevel(level) + logger.debug("Added a stderr logging handler to logger: %s", __name__) + return handler + + +# ... Clean up. +del NullHandler + + +# All warning filters *must* be appended unless you're really certain that they +# shouldn't be: otherwise, it's very hard for users to use most Python +# mechanisms to silence them. +# SecurityWarning's always go off by default. +warnings.simplefilter("always", exceptions.SecurityWarning, append=True) +# InsecurePlatformWarning's don't vary between requests, so we keep it default. +warnings.simplefilter("default", exceptions.InsecurePlatformWarning, append=True) + + +def disable_warnings(category: type[Warning] = exceptions.HTTPWarning) -> None: + """ + Helper for quickly disabling all urllib3 warnings. + """ + warnings.simplefilter("ignore", category) + + +_DEFAULT_POOL = PoolManager() + + +def request( + method: str, + url: str, + *, + body: _TYPE_BODY | None = None, + fields: _TYPE_FIELDS | None = None, + headers: typing.Mapping[str, str] | None = None, + preload_content: bool | None = True, + decode_content: bool | None = True, + redirect: bool | None = True, + retries: Retry | bool | int | None = None, + timeout: Timeout | float | int | None = 3, + json: typing.Any | None = None, +) -> BaseHTTPResponse: + """ + A convenience, top-level request method. It uses a module-global ``PoolManager`` instance. + Therefore, its side effects could be shared across dependencies relying on it. + To avoid side effects create a new ``PoolManager`` instance and use it instead. + The method does not accept low-level ``**urlopen_kw`` keyword arguments. 
+ """ + + return _DEFAULT_POOL.request( + method, + url, + body=body, + fields=fields, + headers=headers, + preload_content=preload_content, + decode_content=decode_content, + redirect=redirect, + retries=retries, + timeout=timeout, + json=json, + ) diff --git a/Modules/urllib3/_base_connection.py b/Modules/urllib3/_base_connection.py new file mode 100644 index 0000000..bb349c7 --- /dev/null +++ b/Modules/urllib3/_base_connection.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import typing + +from .util.connection import _TYPE_SOCKET_OPTIONS +from .util.timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT +from .util.url import Url + +_TYPE_BODY = typing.Union[bytes, typing.IO[typing.Any], typing.Iterable[bytes], str] + + +class ProxyConfig(typing.NamedTuple): + ssl_context: ssl.SSLContext | None + use_forwarding_for_https: bool + assert_hostname: None | str | Literal[False] + assert_fingerprint: str | None + + +class _ResponseOptions(typing.NamedTuple): + # TODO: Remove this in favor of a better + # HTTP request/response lifecycle tracking. + request_method: str + request_url: str + preload_content: bool + decode_content: bool + enforce_content_length: bool + + +if typing.TYPE_CHECKING: + import ssl + from typing import Literal, Protocol + + from .response import BaseHTTPResponse + + class BaseHTTPConnection(Protocol): + default_port: typing.ClassVar[int] + default_socket_options: typing.ClassVar[_TYPE_SOCKET_OPTIONS] + + host: str + port: int + timeout: None | ( + float + ) # Instance doesn't store _DEFAULT_TIMEOUT, must be resolved. 
+ blocksize: int + source_address: tuple[str, int] | None + socket_options: _TYPE_SOCKET_OPTIONS | None + + proxy: Url | None + proxy_config: ProxyConfig | None + + is_verified: bool + proxy_is_verified: bool | None + + def __init__( + self, + host: str, + port: int | None = None, + *, + timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT, + source_address: tuple[str, int] | None = None, + blocksize: int = 8192, + socket_options: _TYPE_SOCKET_OPTIONS | None = ..., + proxy: Url | None = None, + proxy_config: ProxyConfig | None = None, + ) -> None: + ... + + def set_tunnel( + self, + host: str, + port: int | None = None, + headers: typing.Mapping[str, str] | None = None, + scheme: str = "http", + ) -> None: + ... + + def connect(self) -> None: + ... + + def request( + self, + method: str, + url: str, + body: _TYPE_BODY | None = None, + headers: typing.Mapping[str, str] | None = None, + # We know *at least* botocore is depending on the order of the + # first 3 parameters so to be safe we only mark the later ones + # as keyword-only to ensure we have space to extend. + *, + chunked: bool = False, + preload_content: bool = True, + decode_content: bool = True, + enforce_content_length: bool = True, + ) -> None: + ... + + def getresponse(self) -> BaseHTTPResponse: + ... + + def close(self) -> None: + ... + + @property + def is_closed(self) -> bool: + """Whether the connection either is brand new or has been previously closed. + If this property is True then both ``is_connected`` and ``has_connected_to_proxy`` + properties must be False. + """ + + @property + def is_connected(self) -> bool: + """Whether the connection is actively connected to any origin (proxy or target)""" + + @property + def has_connected_to_proxy(self) -> bool: + """Whether the connection has successfully connected to its proxy. + This returns False if no proxy is in use. Used to determine whether + errors are coming from the proxy layer or from tunnelling to the target origin. 
+ """ + + class BaseHTTPSConnection(BaseHTTPConnection, Protocol): + default_port: typing.ClassVar[int] + default_socket_options: typing.ClassVar[_TYPE_SOCKET_OPTIONS] + + # Certificate verification methods + cert_reqs: int | str | None + assert_hostname: None | str | Literal[False] + assert_fingerprint: str | None + ssl_context: ssl.SSLContext | None + + # Trusted CAs + ca_certs: str | None + ca_cert_dir: str | None + ca_cert_data: None | str | bytes + + # TLS version + ssl_minimum_version: int | None + ssl_maximum_version: int | None + ssl_version: int | str | None # Deprecated + + # Client certificates + cert_file: str | None + key_file: str | None + key_password: str | None + + def __init__( + self, + host: str, + port: int | None = None, + *, + timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT, + source_address: tuple[str, int] | None = None, + blocksize: int = 16384, + socket_options: _TYPE_SOCKET_OPTIONS | None = ..., + proxy: Url | None = None, + proxy_config: ProxyConfig | None = None, + cert_reqs: int | str | None = None, + assert_hostname: None | str | Literal[False] = None, + assert_fingerprint: str | None = None, + server_hostname: str | None = None, + ssl_context: ssl.SSLContext | None = None, + ca_certs: str | None = None, + ca_cert_dir: str | None = None, + ca_cert_data: None | str | bytes = None, + ssl_minimum_version: int | None = None, + ssl_maximum_version: int | None = None, + ssl_version: int | str | None = None, # Deprecated + cert_file: str | None = None, + key_file: str | None = None, + key_password: str | None = None, + ) -> None: + ... 
diff --git a/Modules/urllib3/_collections.py b/Modules/urllib3/_collections.py new file mode 100644 index 0000000..55b0324 --- /dev/null +++ b/Modules/urllib3/_collections.py @@ -0,0 +1,483 @@ +from __future__ import annotations + +import typing +from collections import OrderedDict +from enum import Enum, auto +from threading import RLock + +if typing.TYPE_CHECKING: + # We can only import Protocol if TYPE_CHECKING because it's a development + # dependency, and is not available at runtime. + from typing import Protocol + + from typing_extensions import Self + + class HasGettableStringKeys(Protocol): + def keys(self) -> typing.Iterator[str]: + ... + + def __getitem__(self, key: str) -> str: + ... + + +__all__ = ["RecentlyUsedContainer", "HTTPHeaderDict"] + + +# Key type +_KT = typing.TypeVar("_KT") +# Value type +_VT = typing.TypeVar("_VT") +# Default type +_DT = typing.TypeVar("_DT") + +ValidHTTPHeaderSource = typing.Union[ + "HTTPHeaderDict", + typing.Mapping[str, str], + typing.Iterable[typing.Tuple[str, str]], + "HasGettableStringKeys", +] + + +class _Sentinel(Enum): + not_passed = auto() + + +def ensure_can_construct_http_header_dict( + potential: object, +) -> ValidHTTPHeaderSource | None: + if isinstance(potential, HTTPHeaderDict): + return potential + elif isinstance(potential, typing.Mapping): + # Full runtime checking of the contents of a Mapping is expensive, so for the + # purposes of typechecking, we assume that any Mapping is the right shape. + return typing.cast(typing.Mapping[str, str], potential) + elif isinstance(potential, typing.Iterable): + # Similarly to Mapping, full runtime checking of the contents of an Iterable is + # expensive, so for the purposes of typechecking, we assume that any Iterable + # is the right shape. 
+ return typing.cast(typing.Iterable[typing.Tuple[str, str]], potential) + elif hasattr(potential, "keys") and hasattr(potential, "__getitem__"): + return typing.cast("HasGettableStringKeys", potential) + else: + return None + + +class RecentlyUsedContainer(typing.Generic[_KT, _VT], typing.MutableMapping[_KT, _VT]): + """ + Provides a thread-safe dict-like container which maintains up to + ``maxsize`` keys while throwing away the least-recently-used keys beyond + ``maxsize``. + + :param maxsize: + Maximum number of recent elements to retain. + + :param dispose_func: + Every time an item is evicted from the container, + ``dispose_func(value)`` is called. Callback which will get called + """ + + _container: typing.OrderedDict[_KT, _VT] + _maxsize: int + dispose_func: typing.Callable[[_VT], None] | None + lock: RLock + + def __init__( + self, + maxsize: int = 10, + dispose_func: typing.Callable[[_VT], None] | None = None, + ) -> None: + super().__init__() + self._maxsize = maxsize + self.dispose_func = dispose_func + self._container = OrderedDict() + self.lock = RLock() + + def __getitem__(self, key: _KT) -> _VT: + # Re-insert the item, moving it to the end of the eviction line. + with self.lock: + item = self._container.pop(key) + self._container[key] = item + return item + + def __setitem__(self, key: _KT, value: _VT) -> None: + evicted_item = None + with self.lock: + # Possibly evict the existing value of 'key' + try: + # If the key exists, we'll overwrite it, which won't change the + # size of the pool. Because accessing a key should move it to + # the end of the eviction line, we pop it out first. 
+ evicted_item = key, self._container.pop(key) + self._container[key] = value + except KeyError: + # When the key does not exist, we insert the value first so that + # evicting works in all cases, including when self._maxsize is 0 + self._container[key] = value + if len(self._container) > self._maxsize: + # If we didn't evict an existing value, and we've hit our maximum + # size, then we have to evict the least recently used item from + # the beginning of the container. + evicted_item = self._container.popitem(last=False) + + # After releasing the lock on the pool, dispose of any evicted value. + if evicted_item is not None and self.dispose_func: + _, evicted_value = evicted_item + self.dispose_func(evicted_value) + + def __delitem__(self, key: _KT) -> None: + with self.lock: + value = self._container.pop(key) + + if self.dispose_func: + self.dispose_func(value) + + def __len__(self) -> int: + with self.lock: + return len(self._container) + + def __iter__(self) -> typing.NoReturn: + raise NotImplementedError( + "Iteration over this class is unlikely to be threadsafe." + ) + + def clear(self) -> None: + with self.lock: + # Copy pointers to all values, then wipe the mapping + values = list(self._container.values()) + self._container.clear() + + if self.dispose_func: + for value in values: + self.dispose_func(value) + + def keys(self) -> set[_KT]: # type: ignore[override] + with self.lock: + return set(self._container.keys()) + + +class HTTPHeaderDictItemView(typing.Set[typing.Tuple[str, str]]): + """ + HTTPHeaderDict is unusual for a Mapping[str, str] in that it has two modes of + address. 
+ + If we directly try to get an item with a particular name, we will get a string + back that is the concatenated version of all the values: + + >>> d['X-Header-Name'] + 'Value1, Value2, Value3' + + However, if we iterate over an HTTPHeaderDict's items, we will optionally combine + these values based on whether combine=True was called when building up the dictionary + + >>> d = HTTPHeaderDict({"A": "1", "B": "foo"}) + >>> d.add("A", "2", combine=True) + >>> d.add("B", "bar") + >>> list(d.items()) + [ + ('A', '1, 2'), + ('B', 'foo'), + ('B', 'bar'), + ] + + This class conforms to the interface required by the MutableMapping ABC while + also giving us the nonstandard iteration behavior we want; items with duplicate + keys, ordered by time of first insertion. + """ + + _headers: HTTPHeaderDict + + def __init__(self, headers: HTTPHeaderDict) -> None: + self._headers = headers + + def __len__(self) -> int: + return len(list(self._headers.iteritems())) + + def __iter__(self) -> typing.Iterator[tuple[str, str]]: + return self._headers.iteritems() + + def __contains__(self, item: object) -> bool: + if isinstance(item, tuple) and len(item) == 2: + passed_key, passed_val = item + if isinstance(passed_key, str) and isinstance(passed_val, str): + return self._headers._has_value_for_header(passed_key, passed_val) + return False + + +class HTTPHeaderDict(typing.MutableMapping[str, str]): + """ + :param headers: + An iterable of field-value pairs. Must not contain multiple field names + when compared case-insensitively. + + :param kwargs: + Additional field-value pairs to pass in to ``dict.update``. + + A ``dict`` like container for storing HTTP Headers. + + Field names are stored and compared case-insensitively in compliance with + RFC 7230. Iteration provides the first case-sensitive key seen for each + case-insensitive pair. + + Using ``__setitem__`` syntax overwrites fields that compare equal + case-insensitively in order to maintain ``dict``'s api. 
For fields that + compare equal, instead create a new ``HTTPHeaderDict`` and use ``.add`` + in a loop. + + If multiple fields that are equal case-insensitively are passed to the + constructor or ``.update``, the behavior is undefined and some will be + lost. + + >>> headers = HTTPHeaderDict() + >>> headers.add('Set-Cookie', 'foo=bar') + >>> headers.add('set-cookie', 'baz=quxx') + >>> headers['content-length'] = '7' + >>> headers['SET-cookie'] + 'foo=bar, baz=quxx' + >>> headers['Content-Length'] + '7' + """ + + _container: typing.MutableMapping[str, list[str]] + + def __init__(self, headers: ValidHTTPHeaderSource | None = None, **kwargs: str): + super().__init__() + self._container = {} # 'dict' is insert-ordered + if headers is not None: + if isinstance(headers, HTTPHeaderDict): + self._copy_from(headers) + else: + self.extend(headers) + if kwargs: + self.extend(kwargs) + + def __setitem__(self, key: str, val: str) -> None: + # avoid a bytes/str comparison by decoding before httplib + if isinstance(key, bytes): + key = key.decode("latin-1") + self._container[key.lower()] = [key, val] + + def __getitem__(self, key: str) -> str: + val = self._container[key.lower()] + return ", ".join(val[1:]) + + def __delitem__(self, key: str) -> None: + del self._container[key.lower()] + + def __contains__(self, key: object) -> bool: + if isinstance(key, str): + return key.lower() in self._container + return False + + def setdefault(self, key: str, default: str = "") -> str: + return super().setdefault(key, default) + + def __eq__(self, other: object) -> bool: + maybe_constructable = ensure_can_construct_http_header_dict(other) + if maybe_constructable is None: + return False + else: + other_as_http_header_dict = type(self)(maybe_constructable) + + return {k.lower(): v for k, v in self.itermerged()} == { + k.lower(): v for k, v in other_as_http_header_dict.itermerged() + } + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + def __len__(self) -> int: + 
return len(self._container) + + def __iter__(self) -> typing.Iterator[str]: + # Only provide the originally cased names + for vals in self._container.values(): + yield vals[0] + + def discard(self, key: str) -> None: + try: + del self[key] + except KeyError: + pass + + def add(self, key: str, val: str, *, combine: bool = False) -> None: + """Adds a (name, value) pair, doesn't overwrite the value if it already + exists. + + If this is called with combine=True, instead of adding a new header value + as a distinct item during iteration, this will instead append the value to + any existing header value with a comma. If no existing header value exists + for the key, then the value will simply be added, ignoring the combine parameter. + + >>> headers = HTTPHeaderDict(foo='bar') + >>> headers.add('Foo', 'baz') + >>> headers['foo'] + 'bar, baz' + >>> list(headers.items()) + [('foo', 'bar'), ('foo', 'baz')] + >>> headers.add('foo', 'quz', combine=True) + >>> list(headers.items()) + [('foo', 'bar, baz, quz')] + """ + # avoid a bytes/str comparison by decoding before httplib + if isinstance(key, bytes): + key = key.decode("latin-1") + key_lower = key.lower() + new_vals = [key, val] + # Keep the common case aka no item present as fast as possible + vals = self._container.setdefault(key_lower, new_vals) + if new_vals is not vals: + # if there are values here, then there is at least the initial + # key/value pair + assert len(vals) >= 2 + if combine: + vals[-1] = vals[-1] + ", " + val + else: + vals.append(val) + + def extend(self, *args: ValidHTTPHeaderSource, **kwargs: str) -> None: + """Generic import function for any type of header-like object. 
+ Adapted version of MutableMapping.update in order to insert items + with self.add instead of self.__setitem__ + """ + if len(args) > 1: + raise TypeError( + f"extend() takes at most 1 positional arguments ({len(args)} given)" + ) + other = args[0] if len(args) >= 1 else () + + if isinstance(other, HTTPHeaderDict): + for key, val in other.iteritems(): + self.add(key, val) + elif isinstance(other, typing.Mapping): + for key, val in other.items(): + self.add(key, val) + elif isinstance(other, typing.Iterable): + other = typing.cast(typing.Iterable[typing.Tuple[str, str]], other) + for key, value in other: + self.add(key, value) + elif hasattr(other, "keys") and hasattr(other, "__getitem__"): + # THIS IS NOT A TYPESAFE BRANCH + # In this branch, the object has a `keys` attr but is not a Mapping or any of + # the other types indicated in the method signature. We do some stuff with + # it as though it partially implements the Mapping interface, but we're not + # doing that stuff safely AT ALL. + for key in other.keys(): + self.add(key, other[key]) + + for key, value in kwargs.items(): + self.add(key, value) + + @typing.overload + def getlist(self, key: str) -> list[str]: + ... + + @typing.overload + def getlist(self, key: str, default: _DT) -> list[str] | _DT: + ... + + def getlist( + self, key: str, default: _Sentinel | _DT = _Sentinel.not_passed + ) -> list[str] | _DT: + """Returns a list of all the values for the named field. Returns an + empty list if the key doesn't exist.""" + try: + vals = self._container[key.lower()] + except KeyError: + if default is _Sentinel.not_passed: + # _DT is unbound; empty list is instance of List[str] + return [] + # _DT is bound; default is instance of _DT + return default + else: + # _DT may or may not be bound; vals[1:] is instance of List[str], which + # meets our external interface requirement of `Union[List[str], _DT]`. 
+ return vals[1:] + + def _prepare_for_method_change(self) -> Self: + """ + Remove content-specific header fields before changing the request + method to GET or HEAD according to RFC 9110, Section 15.4. + """ + content_specific_headers = [ + "Content-Encoding", + "Content-Language", + "Content-Location", + "Content-Type", + "Content-Length", + "Digest", + "Last-Modified", + ] + for header in content_specific_headers: + self.discard(header) + return self + + # Backwards compatibility for httplib + getheaders = getlist + getallmatchingheaders = getlist + iget = getlist + + # Backwards compatibility for http.cookiejar + get_all = getlist + + def __repr__(self) -> str: + return f"{type(self).__name__}({dict(self.itermerged())})" + + def _copy_from(self, other: HTTPHeaderDict) -> None: + for key in other: + val = other.getlist(key) + self._container[key.lower()] = [key, *val] + + def copy(self) -> HTTPHeaderDict: + clone = type(self)() + clone._copy_from(self) + return clone + + def iteritems(self) -> typing.Iterator[tuple[str, str]]: + """Iterate over all header lines, including duplicate ones.""" + for key in self: + vals = self._container[key.lower()] + for val in vals[1:]: + yield vals[0], val + + def itermerged(self) -> typing.Iterator[tuple[str, str]]: + """Iterate over all headers, merging duplicate ones together.""" + for key in self: + val = self._container[key.lower()] + yield val[0], ", ".join(val[1:]) + + def items(self) -> HTTPHeaderDictItemView: # type: ignore[override] + return HTTPHeaderDictItemView(self) + + def _has_value_for_header(self, header_name: str, potential_value: str) -> bool: + if header_name in self: + return potential_value in self._container[header_name.lower()][1:] + return False + + def __ior__(self, other: object) -> HTTPHeaderDict: + # Supports extending a header dict in-place using operator |= + # combining items with add instead of __setitem__ + maybe_constructable = ensure_can_construct_http_header_dict(other) + if 
maybe_constructable is None: + return NotImplemented + self.extend(maybe_constructable) + return self + + def __or__(self, other: object) -> HTTPHeaderDict: + # Supports merging header dicts using operator | + # combining items with add instead of __setitem__ + maybe_constructable = ensure_can_construct_http_header_dict(other) + if maybe_constructable is None: + return NotImplemented + result = self.copy() + result.extend(maybe_constructable) + return result + + def __ror__(self, other: object) -> HTTPHeaderDict: + # Supports merging header dicts using operator | when other is on left side + # combining items with add instead of __setitem__ + maybe_constructable = ensure_can_construct_http_header_dict(other) + if maybe_constructable is None: + return NotImplemented + result = type(self)(maybe_constructable) + result.extend(self) + return result diff --git a/Modules/urllib3/_request_methods.py b/Modules/urllib3/_request_methods.py new file mode 100644 index 0000000..1d0f346 --- /dev/null +++ b/Modules/urllib3/_request_methods.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json as _json +import typing +from urllib.parse import urlencode + +from ._base_connection import _TYPE_BODY +from ._collections import HTTPHeaderDict +from .filepost import _TYPE_FIELDS, encode_multipart_formdata +from .response import BaseHTTPResponse + +__all__ = ["RequestMethods"] + +_TYPE_ENCODE_URL_FIELDS = typing.Union[ + typing.Sequence[typing.Tuple[str, typing.Union[str, bytes]]], + typing.Mapping[str, typing.Union[str, bytes]], +] + + +class RequestMethods: + """ + Convenience mixin for classes who implement a :meth:`urlopen` method, such + as :class:`urllib3.HTTPConnectionPool` and + :class:`urllib3.PoolManager`. + + Provides behavior for making common types of HTTP request methods and + decides which type of request field encoding to use. 
+ + Specifically, + + :meth:`.request_encode_url` is for sending requests whose fields are + encoded in the URL (such as GET, HEAD, DELETE). + + :meth:`.request_encode_body` is for sending requests whose fields are + encoded in the *body* of the request using multipart or www-form-urlencoded + (such as for POST, PUT, PATCH). + + :meth:`.request` is for making any kind of request, it will look up the + appropriate encoding format and use one of the above two methods to make + the request. + + Initializer parameters: + + :param headers: + Headers to include with all requests, unless other headers are given + explicitly. + """ + + _encode_url_methods = {"DELETE", "GET", "HEAD", "OPTIONS"} + + def __init__(self, headers: typing.Mapping[str, str] | None = None) -> None: + self.headers = headers or {} + + def urlopen( + self, + method: str, + url: str, + body: _TYPE_BODY | None = None, + headers: typing.Mapping[str, str] | None = None, + encode_multipart: bool = True, + multipart_boundary: str | None = None, + **kw: typing.Any, + ) -> BaseHTTPResponse: # Abstract + raise NotImplementedError( + "Classes extending RequestMethods must implement " + "their own ``urlopen`` method." + ) + + def request( + self, + method: str, + url: str, + body: _TYPE_BODY | None = None, + fields: _TYPE_FIELDS | None = None, + headers: typing.Mapping[str, str] | None = None, + json: typing.Any | None = None, + **urlopen_kw: typing.Any, + ) -> BaseHTTPResponse: + """ + Make a request using :meth:`urlopen` with the appropriate encoding of + ``fields`` based on the ``method`` used. + + This is a convenience method that requires the least amount of manual + effort. It can be used in most situations, while still having the + option to drop down to more specific methods when necessary, such as + :meth:`request_encode_url`, :meth:`request_encode_body`, + or even the lowest level :meth:`urlopen`. 
+ """ + method = method.upper() + + if json is not None and body is not None: + raise TypeError( + "request got values for both 'body' and 'json' parameters which are mutually exclusive" + ) + + if json is not None: + if headers is None: + headers = self.headers.copy() # type: ignore + if not ("content-type" in map(str.lower, headers.keys())): + headers["Content-Type"] = "application/json" # type: ignore + + body = _json.dumps(json, separators=(",", ":"), ensure_ascii=False).encode( + "utf-8" + ) + + if body is not None: + urlopen_kw["body"] = body + + if method in self._encode_url_methods: + return self.request_encode_url( + method, + url, + fields=fields, # type: ignore[arg-type] + headers=headers, + **urlopen_kw, + ) + else: + return self.request_encode_body( + method, url, fields=fields, headers=headers, **urlopen_kw + ) + + def request_encode_url( + self, + method: str, + url: str, + fields: _TYPE_ENCODE_URL_FIELDS | None = None, + headers: typing.Mapping[str, str] | None = None, + **urlopen_kw: str, + ) -> BaseHTTPResponse: + """ + Make a request using :meth:`urlopen` with the ``fields`` encoded in + the url. This is useful for request methods like GET, HEAD, DELETE, etc. + """ + if headers is None: + headers = self.headers + + extra_kw: dict[str, typing.Any] = {"headers": headers} + extra_kw.update(urlopen_kw) + + if fields: + url += "?" + urlencode(fields) + + return self.urlopen(method, url, **extra_kw) + + def request_encode_body( + self, + method: str, + url: str, + fields: _TYPE_FIELDS | None = None, + headers: typing.Mapping[str, str] | None = None, + encode_multipart: bool = True, + multipart_boundary: str | None = None, + **urlopen_kw: str, + ) -> BaseHTTPResponse: + """ + Make a request using :meth:`urlopen` with the ``fields`` encoded in + the body. This is useful for request methods like POST, PUT, PATCH, etc. 
+ + When ``encode_multipart=True`` (default), then + :func:`urllib3.encode_multipart_formdata` is used to encode + the payload with the appropriate content type. Otherwise + :func:`urllib.parse.urlencode` is used with the + 'application/x-www-form-urlencoded' content type. + + Multipart encoding must be used when posting files, and it's reasonably + safe to use it in other times too. However, it may break request + signing, such as with OAuth. + + Supports an optional ``fields`` parameter of key/value strings AND + key/filetuple. A filetuple is a (filename, data, MIME type) tuple where + the MIME type is optional. For example:: + + fields = { + 'foo': 'bar', + 'fakefile': ('foofile.txt', 'contents of foofile'), + 'realfile': ('barfile.txt', open('realfile').read()), + 'typedfile': ('bazfile.bin', open('bazfile').read(), + 'image/jpeg'), + 'nonamefile': 'contents of nonamefile field', + } + + When uploading a file, providing a filename (the first parameter of the + tuple) is optional but recommended to best mimic behavior of browsers. + + Note that if ``headers`` are supplied, the 'Content-Type' header will + be overwritten because it depends on the dynamic random boundary string + which is used to compose the body of the request. The random boundary + string can be explicitly set with the ``multipart_boundary`` parameter. + """ + if headers is None: + headers = self.headers + + extra_kw: dict[str, typing.Any] = {"headers": HTTPHeaderDict(headers)} + body: bytes | str + + if fields: + if "body" in urlopen_kw: + raise TypeError( + "request got values for both 'fields' and 'body', can only specify one." 
+ ) + + if encode_multipart: + body, content_type = encode_multipart_formdata( + fields, boundary=multipart_boundary + ) + else: + body, content_type = ( + urlencode(fields), # type: ignore[arg-type] + "application/x-www-form-urlencoded", + ) + + extra_kw["body"] = body + extra_kw["headers"].setdefault("Content-Type", content_type) + + extra_kw.update(urlopen_kw) + + return self.urlopen(method, url, **extra_kw) diff --git a/Modules/urllib3/_version.py b/Modules/urllib3/_version.py new file mode 100644 index 0000000..409ba3f --- /dev/null +++ b/Modules/urllib3/_version.py @@ -0,0 +1,4 @@ +# This file is protected via CODEOWNERS +from __future__ import annotations + +__version__ = "2.1.0" diff --git a/Modules/urllib3/connection.py b/Modules/urllib3/connection.py new file mode 100644 index 0000000..38a2fd6 --- /dev/null +++ b/Modules/urllib3/connection.py @@ -0,0 +1,905 @@ +from __future__ import annotations + +import datetime +import logging +import os +import re +import socket +import sys +import typing +import warnings +from http.client import HTTPConnection as _HTTPConnection +from http.client import HTTPException as HTTPException # noqa: F401 +from http.client import ResponseNotReady +from socket import timeout as SocketTimeout + +if typing.TYPE_CHECKING: + from typing import Literal + + from .response import HTTPResponse + from .util.ssl_ import _TYPE_PEER_CERT_RET_DICT + from .util.ssltransport import SSLTransport + +from ._collections import HTTPHeaderDict +from .util.response import assert_header_parsing +from .util.timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT, Timeout +from .util.util import to_str +from .util.wait import wait_for_read + +try: # Compiled with SSL? 
+ import ssl + + BaseSSLError = ssl.SSLError +except (ImportError, AttributeError): + ssl = None # type: ignore[assignment] + + class BaseSSLError(BaseException): # type: ignore[no-redef] + pass + + +from ._base_connection import _TYPE_BODY +from ._base_connection import ProxyConfig as ProxyConfig +from ._base_connection import _ResponseOptions as _ResponseOptions +from ._version import __version__ +from .exceptions import ( + ConnectTimeoutError, + HeaderParsingError, + NameResolutionError, + NewConnectionError, + ProxyError, + SystemTimeWarning, +) +from .util import SKIP_HEADER, SKIPPABLE_HEADERS, connection, ssl_ +from .util.request import body_to_chunks +from .util.ssl_ import assert_fingerprint as _assert_fingerprint +from .util.ssl_ import ( + create_urllib3_context, + is_ipaddress, + resolve_cert_reqs, + resolve_ssl_version, + ssl_wrap_socket, +) +from .util.ssl_match_hostname import CertificateError, match_hostname +from .util.url import Url + +# Not a no-op, we're adding this to the namespace so it can be imported. +ConnectionError = ConnectionError +BrokenPipeError = BrokenPipeError + + +log = logging.getLogger(__name__) + +port_by_scheme = {"http": 80, "https": 443} + +# When it comes time to update this value as a part of regular maintenance +# (ie test_recent_date is failing) update it to ~6 months before the current date. +RECENT_DATE = datetime.date(2022, 1, 1) + +_CONTAINS_CONTROL_CHAR_RE = re.compile(r"[^-!#$%&'*+.^_`|~0-9a-zA-Z]") + +_HAS_SYS_AUDIT = hasattr(sys, "audit") + + +class HTTPConnection(_HTTPConnection): + """ + Based on :class:`http.client.HTTPConnection` but provides an extra constructor + backwards-compatibility layer between older and newer Pythons. + + Additional keyword parameters are used to configure attributes of the connection. + Accepted parameters include: + + - ``source_address``: Set the source address for the current connection. + - ``socket_options``: Set specific options on the underlying socket. 
If not specified, then + defaults are loaded from ``HTTPConnection.default_socket_options`` which includes disabling + Nagle's algorithm (sets TCP_NODELAY to 1) unless the connection is behind a proxy. + + For example, if you wish to enable TCP Keep Alive in addition to the defaults, + you might pass: + + .. code-block:: python + + HTTPConnection.default_socket_options + [ + (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), + ] + + Or you may want to disable the defaults by passing an empty list (e.g., ``[]``). + """ + + default_port: typing.ClassVar[int] = port_by_scheme["http"] # type: ignore[misc] + + #: Disable Nagle's algorithm by default. + #: ``[(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)]`` + default_socket_options: typing.ClassVar[connection._TYPE_SOCKET_OPTIONS] = [ + (socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + ] + + #: Whether this connection verifies the host's certificate. + is_verified: bool = False + + #: Whether this proxy connection verified the proxy host's certificate. + # If no proxy is currently connected to the value will be ``None``. 
+ proxy_is_verified: bool | None = None + + blocksize: int + source_address: tuple[str, int] | None + socket_options: connection._TYPE_SOCKET_OPTIONS | None + + _has_connected_to_proxy: bool + _response_options: _ResponseOptions | None + _tunnel_host: str | None + _tunnel_port: int | None + _tunnel_scheme: str | None + + def __init__( + self, + host: str, + port: int | None = None, + *, + timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT, + source_address: tuple[str, int] | None = None, + blocksize: int = 16384, + socket_options: None + | (connection._TYPE_SOCKET_OPTIONS) = default_socket_options, + proxy: Url | None = None, + proxy_config: ProxyConfig | None = None, + ) -> None: + super().__init__( + host=host, + port=port, + timeout=Timeout.resolve_default_timeout(timeout), + source_address=source_address, + blocksize=blocksize, + ) + self.socket_options = socket_options + self.proxy = proxy + self.proxy_config = proxy_config + + self._has_connected_to_proxy = False + self._response_options = None + self._tunnel_host: str | None = None + self._tunnel_port: int | None = None + self._tunnel_scheme: str | None = None + + # https://github.com/python/mypy/issues/4125 + # Mypy treats this as LSP violation, which is considered a bug. + # If `host` is made a property it violates LSP, because a writeable attribute is overridden with a read-only one. + # However, there is also a `host` setter so LSP is not violated. + # Potentially, a `@host.deleter` might be needed depending on how this issue will be fixed. + @property + def host(self) -> str: + """ + Getter method to remove any trailing dots that indicate the hostname is an FQDN. + + In general, SSL certificates don't include the trailing dot indicating a + fully-qualified domain name, and thus, they don't validate properly when + checked against a domain name that includes the dot. In addition, some + servers may not expect to receive the trailing dot when provided. 
+ + However, the hostname with trailing dot is critical to DNS resolution; doing a + lookup with the trailing dot will properly only resolve the appropriate FQDN, + whereas a lookup without a trailing dot will search the system's search domain + list. Thus, it's important to keep the original host around for use only in + those cases where it's appropriate (i.e., when doing DNS lookup to establish the + actual TCP connection across which we're going to send HTTP requests). + """ + return self._dns_host.rstrip(".") + + @host.setter + def host(self, value: str) -> None: + """ + Setter for the `host` property. + + We assume that only urllib3 uses the _dns_host attribute; httplib itself + only uses `host`, and it seems reasonable that other libraries follow suit. + """ + self._dns_host = value + + def _new_conn(self) -> socket.socket: + """Establish a socket connection and set nodelay settings on it. + + :return: New socket connection. + """ + try: + sock = connection.create_connection( + (self._dns_host, self.port), + self.timeout, + source_address=self.source_address, + socket_options=self.socket_options, + ) + except socket.gaierror as e: + raise NameResolutionError(self.host, self, e) from e + except SocketTimeout as e: + raise ConnectTimeoutError( + self, + f"Connection to {self.host} timed out. 
(connect timeout={self.timeout})", + ) from e + + except OSError as e: + raise NewConnectionError( + self, f"Failed to establish a new connection: {e}" + ) from e + + # Audit hooks are only available in Python 3.8+ + if _HAS_SYS_AUDIT: + sys.audit("http.client.connect", self, self.host, self.port) + + return sock + + def set_tunnel( + self, + host: str, + port: int | None = None, + headers: typing.Mapping[str, str] | None = None, + scheme: str = "http", + ) -> None: + if scheme not in ("http", "https"): + raise ValueError( + f"Invalid proxy scheme for tunneling: {scheme!r}, must be either 'http' or 'https'" + ) + super().set_tunnel(host, port=port, headers=headers) + self._tunnel_scheme = scheme + + def connect(self) -> None: + self.sock = self._new_conn() + if self._tunnel_host: + # If we're tunneling it means we're connected to our proxy. + self._has_connected_to_proxy = True + + # TODO: Fix tunnel so it doesn't depend on self.sock state. + self._tunnel() # type: ignore[attr-defined] + + # If there's a proxy to be connected to we are fully connected. + # This is set twice (once above and here) due to forwarding proxies + # not using tunnelling. + self._has_connected_to_proxy = bool(self.proxy) + + @property + def is_closed(self) -> bool: + return self.sock is None + + @property + def is_connected(self) -> bool: + if self.sock is None: + return False + return not wait_for_read(self.sock, timeout=0.0) + + @property + def has_connected_to_proxy(self) -> bool: + return self._has_connected_to_proxy + + def close(self) -> None: + try: + super().close() + finally: + # Reset all stateful properties so connection + # can be re-used without leaking prior configs. 
+ self.sock = None + self.is_verified = False + self.proxy_is_verified = None + self._has_connected_to_proxy = False + self._response_options = None + self._tunnel_host = None + self._tunnel_port = None + self._tunnel_scheme = None + + def putrequest( + self, + method: str, + url: str, + skip_host: bool = False, + skip_accept_encoding: bool = False, + ) -> None: + """""" + # Empty docstring because the indentation of CPython's implementation + # is broken but we don't want this method in our documentation. + match = _CONTAINS_CONTROL_CHAR_RE.search(method) + if match: + raise ValueError( + f"Method cannot contain non-token characters {method!r} (found at least {match.group()!r})" + ) + + return super().putrequest( + method, url, skip_host=skip_host, skip_accept_encoding=skip_accept_encoding + ) + + def putheader(self, header: str, *values: str) -> None: + """""" + if not any(isinstance(v, str) and v == SKIP_HEADER for v in values): + super().putheader(header, *values) + elif to_str(header.lower()) not in SKIPPABLE_HEADERS: + skippable_headers = "', '".join( + [str.title(header) for header in sorted(SKIPPABLE_HEADERS)] + ) + raise ValueError( + f"urllib3.util.SKIP_HEADER only supports '{skippable_headers}'" + ) + + # `request` method's signature intentionally violates LSP. + # urllib3's API is different from `http.client.HTTPConnection` and the subclassing is only incidental. + def request( # type: ignore[override] + self, + method: str, + url: str, + body: _TYPE_BODY | None = None, + headers: typing.Mapping[str, str] | None = None, + *, + chunked: bool = False, + preload_content: bool = True, + decode_content: bool = True, + enforce_content_length: bool = True, + ) -> None: + # Update the inner socket's timeout value to send the request. + # This only triggers if the connection is re-used. + if self.sock is not None: + self.sock.settimeout(self.timeout) + + # Store these values to be fed into the HTTPResponse + # object later. 
TODO: Remove this in favor of a real + # HTTP lifecycle mechanism. + + # We have to store these before we call .request() + # because sometimes we can still salvage a response + # off the wire even if we aren't able to completely + # send the request body. + self._response_options = _ResponseOptions( + request_method=method, + request_url=url, + preload_content=preload_content, + decode_content=decode_content, + enforce_content_length=enforce_content_length, + ) + + if headers is None: + headers = {} + header_keys = frozenset(to_str(k.lower()) for k in headers) + skip_accept_encoding = "accept-encoding" in header_keys + skip_host = "host" in header_keys + self.putrequest( + method, url, skip_accept_encoding=skip_accept_encoding, skip_host=skip_host + ) + + # Transform the body into an iterable of sendall()-able chunks + # and detect if an explicit Content-Length is doable. + chunks_and_cl = body_to_chunks(body, method=method, blocksize=self.blocksize) + chunks = chunks_and_cl.chunks + content_length = chunks_and_cl.content_length + + # When chunked is explicit set to 'True' we respect that. + if chunked: + if "transfer-encoding" not in header_keys: + self.putheader("Transfer-Encoding", "chunked") + else: + # Detect whether a framing mechanism is already in use. If so + # we respect that value, otherwise we pick chunked vs content-length + # depending on the type of 'body'. + if "content-length" in header_keys: + chunked = False + elif "transfer-encoding" in header_keys: + chunked = True + + # Otherwise we go off the recommendation of 'body_to_chunks()'. + else: + chunked = False + if content_length is None: + if chunks is not None: + chunked = True + self.putheader("Transfer-Encoding", "chunked") + else: + self.putheader("Content-Length", str(content_length)) + + # Now that framing headers are out of the way we send all the other headers. 
+ if "user-agent" not in header_keys: + self.putheader("User-Agent", _get_default_user_agent()) + for header, value in headers.items(): + self.putheader(header, value) + self.endheaders() + + # If we're given a body we start sending that in chunks. + if chunks is not None: + for chunk in chunks: + # Sending empty chunks isn't allowed for TE: chunked + # as it indicates the end of the body. + if not chunk: + continue + if isinstance(chunk, str): + chunk = chunk.encode("utf-8") + if chunked: + self.send(b"%x\r\n%b\r\n" % (len(chunk), chunk)) + else: + self.send(chunk) + + # Regardless of whether we have a body or not, if we're in + # chunked mode we want to send an explicit empty chunk. + if chunked: + self.send(b"0\r\n\r\n") + + def request_chunked( + self, + method: str, + url: str, + body: _TYPE_BODY | None = None, + headers: typing.Mapping[str, str] | None = None, + ) -> None: + """ + Alternative to the common request method, which sends the + body with chunked encoding and not as one block + """ + warnings.warn( + "HTTPConnection.request_chunked() is deprecated and will be removed " + "in urllib3 v2.1.0. Instead use HTTPConnection.request(..., chunked=True).", + category=DeprecationWarning, + stacklevel=2, + ) + self.request(method, url, body=body, headers=headers, chunked=True) + + def getresponse( # type: ignore[override] + self, + ) -> HTTPResponse: + """ + Get the response from the server. + + If the HTTPConnection is in the correct state, returns an instance of HTTPResponse or of whatever object is returned by the response_class variable. + + If a request has not been sent or if a previous response has not be handled, ResponseNotReady is raised. If the HTTP response indicates that the connection should be closed, then it will be closed before the response is returned. When the connection is closed, the underlying socket is closed. 
+ """ + # Raise the same error as http.client.HTTPConnection + if self._response_options is None: + raise ResponseNotReady() + + # Reset this attribute for being used again. + resp_options = self._response_options + self._response_options = None + + # Since the connection's timeout value may have been updated + # we need to set the timeout on the socket. + self.sock.settimeout(self.timeout) + + # This is needed here to avoid circular import errors + from .response import HTTPResponse + + # Get the response from http.client.HTTPConnection + httplib_response = super().getresponse() + + try: + assert_header_parsing(httplib_response.msg) + except (HeaderParsingError, TypeError) as hpe: + log.warning( + "Failed to parse headers (url=%s): %s", + _url_from_connection(self, resp_options.request_url), + hpe, + exc_info=True, + ) + + headers = HTTPHeaderDict(httplib_response.msg.items()) + + response = HTTPResponse( + body=httplib_response, + headers=headers, + status=httplib_response.status, + version=httplib_response.version, + reason=httplib_response.reason, + preload_content=resp_options.preload_content, + decode_content=resp_options.decode_content, + original_response=httplib_response, + enforce_content_length=resp_options.enforce_content_length, + request_method=resp_options.request_method, + request_url=resp_options.request_url, + ) + return response + + +class HTTPSConnection(HTTPConnection): + """ + Many of the parameters to this constructor are passed to the underlying SSL + socket by means of :py:func:`urllib3.util.ssl_wrap_socket`. 
+ """ + + default_port = port_by_scheme["https"] # type: ignore[misc] + + cert_reqs: int | str | None = None + ca_certs: str | None = None + ca_cert_dir: str | None = None + ca_cert_data: None | str | bytes = None + ssl_version: int | str | None = None + ssl_minimum_version: int | None = None + ssl_maximum_version: int | None = None + assert_fingerprint: str | None = None + + def __init__( + self, + host: str, + port: int | None = None, + *, + timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT, + source_address: tuple[str, int] | None = None, + blocksize: int = 16384, + socket_options: None + | (connection._TYPE_SOCKET_OPTIONS) = HTTPConnection.default_socket_options, + proxy: Url | None = None, + proxy_config: ProxyConfig | None = None, + cert_reqs: int | str | None = None, + assert_hostname: None | str | Literal[False] = None, + assert_fingerprint: str | None = None, + server_hostname: str | None = None, + ssl_context: ssl.SSLContext | None = None, + ca_certs: str | None = None, + ca_cert_dir: str | None = None, + ca_cert_data: None | str | bytes = None, + ssl_minimum_version: int | None = None, + ssl_maximum_version: int | None = None, + ssl_version: int | str | None = None, # Deprecated + cert_file: str | None = None, + key_file: str | None = None, + key_password: str | None = None, + ) -> None: + super().__init__( + host, + port=port, + timeout=timeout, + source_address=source_address, + blocksize=blocksize, + socket_options=socket_options, + proxy=proxy, + proxy_config=proxy_config, + ) + + self.key_file = key_file + self.cert_file = cert_file + self.key_password = key_password + self.ssl_context = ssl_context + self.server_hostname = server_hostname + self.assert_hostname = assert_hostname + self.assert_fingerprint = assert_fingerprint + self.ssl_version = ssl_version + self.ssl_minimum_version = ssl_minimum_version + self.ssl_maximum_version = ssl_maximum_version + self.ca_certs = ca_certs and os.path.expanduser(ca_certs) + self.ca_cert_dir = ca_cert_dir and 
os.path.expanduser(ca_cert_dir) + self.ca_cert_data = ca_cert_data + + # cert_reqs depends on ssl_context so calculate last. + if cert_reqs is None: + if self.ssl_context is not None: + cert_reqs = self.ssl_context.verify_mode + else: + cert_reqs = resolve_cert_reqs(None) + self.cert_reqs = cert_reqs + + def set_cert( + self, + key_file: str | None = None, + cert_file: str | None = None, + cert_reqs: int | str | None = None, + key_password: str | None = None, + ca_certs: str | None = None, + assert_hostname: None | str | Literal[False] = None, + assert_fingerprint: str | None = None, + ca_cert_dir: str | None = None, + ca_cert_data: None | str | bytes = None, + ) -> None: + """ + This method should only be called once, before the connection is used. + """ + warnings.warn( + "HTTPSConnection.set_cert() is deprecated and will be removed " + "in urllib3 v2.1.0. Instead provide the parameters to the " + "HTTPSConnection constructor.", + category=DeprecationWarning, + stacklevel=2, + ) + + # If cert_reqs is not provided we'll assume CERT_REQUIRED unless we also + # have an SSLContext object in which case we'll use its verify_mode. + if cert_reqs is None: + if self.ssl_context is not None: + cert_reqs = self.ssl_context.verify_mode + else: + cert_reqs = resolve_cert_reqs(None) + + self.key_file = key_file + self.cert_file = cert_file + self.cert_reqs = cert_reqs + self.key_password = key_password + self.assert_hostname = assert_hostname + self.assert_fingerprint = assert_fingerprint + self.ca_certs = ca_certs and os.path.expanduser(ca_certs) + self.ca_cert_dir = ca_cert_dir and os.path.expanduser(ca_cert_dir) + self.ca_cert_data = ca_cert_data + + def connect(self) -> None: + sock: socket.socket | ssl.SSLSocket + self.sock = sock = self._new_conn() + server_hostname: str = self.host + tls_in_tls = False + + # Do we need to establish a tunnel? + if self._tunnel_host is not None: + # We're tunneling to an HTTPS origin so need to do TLS-in-TLS. 
+ if self._tunnel_scheme == "https": + self.sock = sock = self._connect_tls_proxy(self.host, sock) + tls_in_tls = True + + # If we're tunneling it means we're connected to our proxy. + self._has_connected_to_proxy = True + + self._tunnel() # type: ignore[attr-defined] + # Override the host with the one we're requesting data from. + server_hostname = self._tunnel_host + + if self.server_hostname is not None: + server_hostname = self.server_hostname + + is_time_off = datetime.date.today() < RECENT_DATE + if is_time_off: + warnings.warn( + ( + f"System time is way off (before {RECENT_DATE}). This will probably " + "lead to SSL verification errors" + ), + SystemTimeWarning, + ) + + sock_and_verified = _ssl_wrap_socket_and_match_hostname( + sock=sock, + cert_reqs=self.cert_reqs, + ssl_version=self.ssl_version, + ssl_minimum_version=self.ssl_minimum_version, + ssl_maximum_version=self.ssl_maximum_version, + ca_certs=self.ca_certs, + ca_cert_dir=self.ca_cert_dir, + ca_cert_data=self.ca_cert_data, + cert_file=self.cert_file, + key_file=self.key_file, + key_password=self.key_password, + server_hostname=server_hostname, + ssl_context=self.ssl_context, + tls_in_tls=tls_in_tls, + assert_hostname=self.assert_hostname, + assert_fingerprint=self.assert_fingerprint, + ) + self.sock = sock_and_verified.socket + self.is_verified = sock_and_verified.is_verified + + # If there's a proxy to be connected to we are fully connected. + # This is set twice (once above and here) due to forwarding proxies + # not using tunnelling. + self._has_connected_to_proxy = bool(self.proxy) + + def _connect_tls_proxy(self, hostname: str, sock: socket.socket) -> ssl.SSLSocket: + """ + Establish a TLS connection to the proxy using the provided SSL context. + """ + # `_connect_tls_proxy` is called when self._tunnel_host is truthy. 
+ proxy_config = typing.cast(ProxyConfig, self.proxy_config) + ssl_context = proxy_config.ssl_context + sock_and_verified = _ssl_wrap_socket_and_match_hostname( + sock, + cert_reqs=self.cert_reqs, + ssl_version=self.ssl_version, + ssl_minimum_version=self.ssl_minimum_version, + ssl_maximum_version=self.ssl_maximum_version, + ca_certs=self.ca_certs, + ca_cert_dir=self.ca_cert_dir, + ca_cert_data=self.ca_cert_data, + server_hostname=hostname, + ssl_context=ssl_context, + assert_hostname=proxy_config.assert_hostname, + assert_fingerprint=proxy_config.assert_fingerprint, + # Features that aren't implemented for proxies yet: + cert_file=None, + key_file=None, + key_password=None, + tls_in_tls=False, + ) + self.proxy_is_verified = sock_and_verified.is_verified + return sock_and_verified.socket # type: ignore[return-value] + + +class _WrappedAndVerifiedSocket(typing.NamedTuple): + """ + Wrapped socket and whether the connection is + verified after the TLS handshake + """ + + socket: ssl.SSLSocket | SSLTransport + is_verified: bool + + +def _ssl_wrap_socket_and_match_hostname( + sock: socket.socket, + *, + cert_reqs: None | str | int, + ssl_version: None | str | int, + ssl_minimum_version: int | None, + ssl_maximum_version: int | None, + cert_file: str | None, + key_file: str | None, + key_password: str | None, + ca_certs: str | None, + ca_cert_dir: str | None, + ca_cert_data: None | str | bytes, + assert_hostname: None | str | Literal[False], + assert_fingerprint: str | None, + server_hostname: str | None, + ssl_context: ssl.SSLContext | None, + tls_in_tls: bool = False, +) -> _WrappedAndVerifiedSocket: + """Logic for constructing an SSLContext from all TLS parameters, passing + that down into ssl_wrap_socket, and then doing certificate verification + either via hostname or fingerprint. This function exists to guarantee + that both proxies and targets have the same behavior when connecting via TLS. 
+ """ + default_ssl_context = False + if ssl_context is None: + default_ssl_context = True + context = create_urllib3_context( + ssl_version=resolve_ssl_version(ssl_version), + ssl_minimum_version=ssl_minimum_version, + ssl_maximum_version=ssl_maximum_version, + cert_reqs=resolve_cert_reqs(cert_reqs), + ) + else: + context = ssl_context + + context.verify_mode = resolve_cert_reqs(cert_reqs) + + # In some cases, we want to verify hostnames ourselves + if ( + # `ssl` can't verify fingerprints or alternate hostnames + assert_fingerprint + or assert_hostname + # assert_hostname can be set to False to disable hostname checking + or assert_hostname is False + # We still support OpenSSL 1.0.2, which prevents us from verifying + # hostnames easily: https://github.com/pyca/pyopenssl/pull/933 + or ssl_.IS_PYOPENSSL + or not ssl_.HAS_NEVER_CHECK_COMMON_NAME + ): + context.check_hostname = False + + # Try to load OS default certs if none are given. We need to do the hasattr() check + # for custom pyOpenSSL SSLContext objects because they don't support + # load_default_certs(). + if ( + not ca_certs + and not ca_cert_dir + and not ca_cert_data + and default_ssl_context + and hasattr(context, "load_default_certs") + ): + context.load_default_certs() + + # Ensure that IPv6 addresses are in the proper format and don't have a + # scope ID. Python's SSL module fails to recognize scoped IPv6 addresses + # and interprets them as DNS hostnames. 
+ if server_hostname is not None: + normalized = server_hostname.strip("[]") + if "%" in normalized: + normalized = normalized[: normalized.rfind("%")] + if is_ipaddress(normalized): + server_hostname = normalized + + ssl_sock = ssl_wrap_socket( + sock=sock, + keyfile=key_file, + certfile=cert_file, + key_password=key_password, + ca_certs=ca_certs, + ca_cert_dir=ca_cert_dir, + ca_cert_data=ca_cert_data, + server_hostname=server_hostname, + ssl_context=context, + tls_in_tls=tls_in_tls, + ) + + try: + if assert_fingerprint: + _assert_fingerprint( + ssl_sock.getpeercert(binary_form=True), assert_fingerprint + ) + elif ( + context.verify_mode != ssl.CERT_NONE + and not context.check_hostname + and assert_hostname is not False + ): + cert: _TYPE_PEER_CERT_RET_DICT = ssl_sock.getpeercert() # type: ignore[assignment] + + # Need to signal to our match_hostname whether to use 'commonName' or not. + # If we're using our own constructed SSLContext we explicitly set 'False' + # because PyPy hard-codes 'True' from SSLContext.hostname_checks_common_name. + if default_ssl_context: + hostname_checks_common_name = False + else: + hostname_checks_common_name = ( + getattr(context, "hostname_checks_common_name", False) or False + ) + + _match_hostname( + cert, + assert_hostname or server_hostname, # type: ignore[arg-type] + hostname_checks_common_name, + ) + + return _WrappedAndVerifiedSocket( + socket=ssl_sock, + is_verified=context.verify_mode == ssl.CERT_REQUIRED + or bool(assert_fingerprint), + ) + except BaseException: + ssl_sock.close() + raise + + +def _match_hostname( + cert: _TYPE_PEER_CERT_RET_DICT | None, + asserted_hostname: str, + hostname_checks_common_name: bool = False, +) -> None: + # Our upstream implementation of ssl.match_hostname() + # only applies this normalization to IP addresses so it doesn't + # match DNS SANs so we do the same thing! 
+ stripped_hostname = asserted_hostname.strip("[]") + if is_ipaddress(stripped_hostname): + asserted_hostname = stripped_hostname + + try: + match_hostname(cert, asserted_hostname, hostname_checks_common_name) + except CertificateError as e: + log.warning( + "Certificate did not match expected hostname: %s. Certificate: %s", + asserted_hostname, + cert, + ) + # Add cert to exception and reraise so client code can inspect + # the cert when catching the exception, if they want to + e._peer_cert = cert # type: ignore[attr-defined] + raise + + +def _wrap_proxy_error(err: Exception, proxy_scheme: str | None) -> ProxyError: + # Look for the phrase 'wrong version number', if found + # then we should warn the user that we're very sure that + # this proxy is HTTP-only and they have a configuration issue. + error_normalized = " ".join(re.split("[^a-z]", str(err).lower())) + is_likely_http_proxy = ( + "wrong version number" in error_normalized + or "unknown protocol" in error_normalized + ) + http_proxy_warning = ( + ". Your proxy appears to only use HTTP and not HTTPS, " + "try changing your proxy URL to be HTTP. See: " + "https://urllib3.readthedocs.io/en/latest/advanced-usage.html" + "#https-proxy-error-http-proxy" + ) + new_err = ProxyError( + f"Unable to connect to proxy" + f"{http_proxy_warning if is_likely_http_proxy and proxy_scheme == 'https' else ''}", + err, + ) + new_err.__cause__ = err + return new_err + + +def _get_default_user_agent() -> str: + return f"python-urllib3/{__version__}" + + +class DummyConnection: + """Used to detect a failed ConnectionCls import.""" + + +if not ssl: + HTTPSConnection = DummyConnection # type: ignore[misc, assignment] # noqa: F811 + + +VerifiedHTTPSConnection = HTTPSConnection + + +def _url_from_connection( + conn: HTTPConnection | HTTPSConnection, path: str | None = None +) -> str: + """Returns the URL from a given connection. 
This is mainly used for testing and logging.""" + + scheme = "https" if isinstance(conn, HTTPSConnection) else "http" + + return Url(scheme=scheme, host=conn.host, port=conn.port, path=path).url diff --git a/Modules/urllib3/connectionpool.py b/Modules/urllib3/connectionpool.py new file mode 100644 index 0000000..70048b7 --- /dev/null +++ b/Modules/urllib3/connectionpool.py @@ -0,0 +1,1182 @@ +from __future__ import annotations + +import errno +import logging +import queue +import sys +import typing +import warnings +import weakref +from socket import timeout as SocketTimeout +from types import TracebackType + +from ._base_connection import _TYPE_BODY +from ._collections import HTTPHeaderDict +from ._request_methods import RequestMethods +from .connection import ( + BaseSSLError, + BrokenPipeError, + DummyConnection, + HTTPConnection, + HTTPException, + HTTPSConnection, + ProxyConfig, + _wrap_proxy_error, +) +from .connection import port_by_scheme as port_by_scheme +from .exceptions import ( + ClosedPoolError, + EmptyPoolError, + FullPoolError, + HostChangedError, + InsecureRequestWarning, + LocationValueError, + MaxRetryError, + NewConnectionError, + ProtocolError, + ProxyError, + ReadTimeoutError, + SSLError, + TimeoutError, +) +from .response import BaseHTTPResponse +from .util.connection import is_connection_dropped +from .util.proxy import connection_requires_http_tunnel +from .util.request import _TYPE_BODY_POSITION, set_file_position +from .util.retry import Retry +from .util.ssl_match_hostname import CertificateError +from .util.timeout import _DEFAULT_TIMEOUT, _TYPE_DEFAULT, Timeout +from .util.url import Url, _encode_target +from .util.url import _normalize_host as normalize_host +from .util.url import parse_url +from .util.util import to_str + +if typing.TYPE_CHECKING: + import ssl + from typing import Literal + + from ._base_connection import BaseHTTPConnection, BaseHTTPSConnection + +log = logging.getLogger(__name__) + +_TYPE_TIMEOUT = 
typing.Union[Timeout, float, _TYPE_DEFAULT, None] + +_SelfT = typing.TypeVar("_SelfT") + + +# Pool objects +class ConnectionPool: + """ + Base class for all connection pools, such as + :class:`.HTTPConnectionPool` and :class:`.HTTPSConnectionPool`. + + .. note:: + ConnectionPool.urlopen() does not normalize or percent-encode target URIs + which is useful if your target server doesn't support percent-encoded + target URIs. + """ + + scheme: str | None = None + QueueCls = queue.LifoQueue + + def __init__(self, host: str, port: int | None = None) -> None: + if not host: + raise LocationValueError("No host specified.") + + self.host = _normalize_host(host, scheme=self.scheme) + self.port = port + + # This property uses 'normalize_host()' (not '_normalize_host()') + # to avoid removing square braces around IPv6 addresses. + # This value is sent to `HTTPConnection.set_tunnel()` if called + # because square braces are required for HTTP CONNECT tunneling. + self._tunnel_host = normalize_host(host, scheme=self.scheme).lower() + + def __str__(self) -> str: + return f"{type(self).__name__}(host={self.host!r}, port={self.port!r})" + + def __enter__(self: _SelfT) -> _SelfT: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> Literal[False]: + self.close() + # Return False to re-raise any potential exceptions + return False + + def close(self) -> None: + """ + Close all pooled connections and disable the pool. + """ + + +# This is taken from http://hg.python.org/cpython/file/7aaba721ebc0/Lib/socket.py#l252 +_blocking_errnos = {errno.EAGAIN, errno.EWOULDBLOCK} + + +class HTTPConnectionPool(ConnectionPool, RequestMethods): + """ + Thread-safe connection pool for one host. + + :param host: + Host used for this HTTP Connection (e.g. "localhost"), passed into + :class:`http.client.HTTPConnection`. 
+ + :param port: + Port used for this HTTP Connection (None is equivalent to 80), passed + into :class:`http.client.HTTPConnection`. + + :param timeout: + Socket timeout in seconds for each individual connection. This can + be a float or integer, which sets the timeout for the HTTP request, + or an instance of :class:`urllib3.util.Timeout` which gives you more + fine-grained control over request timeouts. After the constructor has + been parsed, this is always a `urllib3.util.Timeout` object. + + :param maxsize: + Number of connections to save that can be reused. More than 1 is useful + in multithreaded situations. If ``block`` is set to False, more + connections will be created but they will not be saved once they've + been used. + + :param block: + If set to True, no more than ``maxsize`` connections will be used at + a time. When no free connections are available, the call will block + until a connection has been released. This is a useful side effect for + particular multithreaded situations where one does not want to use more + than maxsize connections per host to prevent flooding. + + :param headers: + Headers to include with all requests, unless other headers are given + explicitly. + + :param retries: + Retry configuration to use by default with requests in this pool. + + :param _proxy: + Parsed proxy URL, should not be used directly, instead, see + :class:`urllib3.ProxyManager` + + :param _proxy_headers: + A dictionary with proxy headers, should not be used directly, + instead, see :class:`urllib3.ProxyManager` + + :param \\**conn_kw: + Additional parameters are used to create fresh :class:`urllib3.connection.HTTPConnection`, + :class:`urllib3.connection.HTTPSConnection` instances. 
+ """ + + scheme = "http" + ConnectionCls: ( + type[BaseHTTPConnection] | type[BaseHTTPSConnection] + ) = HTTPConnection + + def __init__( + self, + host: str, + port: int | None = None, + timeout: _TYPE_TIMEOUT | None = _DEFAULT_TIMEOUT, + maxsize: int = 1, + block: bool = False, + headers: typing.Mapping[str, str] | None = None, + retries: Retry | bool | int | None = None, + _proxy: Url | None = None, + _proxy_headers: typing.Mapping[str, str] | None = None, + _proxy_config: ProxyConfig | None = None, + **conn_kw: typing.Any, + ): + ConnectionPool.__init__(self, host, port) + RequestMethods.__init__(self, headers) + + if not isinstance(timeout, Timeout): + timeout = Timeout.from_float(timeout) + + if retries is None: + retries = Retry.DEFAULT + + self.timeout = timeout + self.retries = retries + + self.pool: queue.LifoQueue[typing.Any] | None = self.QueueCls(maxsize) + self.block = block + + self.proxy = _proxy + self.proxy_headers = _proxy_headers or {} + self.proxy_config = _proxy_config + + # Fill the queue up so that doing get() on it will block properly + for _ in range(maxsize): + self.pool.put(None) + + # These are mostly for testing and debugging purposes. + self.num_connections = 0 + self.num_requests = 0 + self.conn_kw = conn_kw + + if self.proxy: + # Enable Nagle's algorithm for proxies, to avoid packet fragmentation. + # We cannot know if the user has added default socket options, so we cannot replace the + # list. + self.conn_kw.setdefault("socket_options", []) + + self.conn_kw["proxy"] = self.proxy + self.conn_kw["proxy_config"] = self.proxy_config + + # Do not pass 'self' as callback to 'finalize'. + # Then the 'finalize' would keep an endless living (leak) to self. + # By just passing a reference to the pool allows the garbage collector + # to free self if nobody else has a reference to it. + pool = self.pool + + # Close all the HTTPConnections in the pool before the + # HTTPConnectionPool object is garbage collected. 
def _put_conn(self, conn: BaseHTTPConnection | None) -> None:
    """
    Put a connection back into the pool.

    :param conn:
        Connection object for the current host and port as returned by
        :meth:`._new_conn` or :meth:`._get_conn`. May be ``None``, in which
        case an empty slot is returned to the pool.

    If the pool is already full, the connection is closed and discarded
    because we exceeded maxsize. If connections are discarded frequently,
    then maxsize should be increased.

    If the pool is closed, then the connection will be closed and discarded.

    :raises FullPoolError:
        If the pool is full and the pool is in blocking mode. The
        connection is closed before the error is raised.
    """
    # Work on a snapshot of the queue. close() rebinds self.pool to None, so
    # re-reading the attribute after the None check (for put() or qsize())
    # could fail with AttributeError if the pool is closed in between.
    pool = self.pool

    if pool is not None:
        try:
            pool.put(conn, block=False)
            return  # Everything is dandy, done.
        except queue.Full:
            if self.block:
                # This should never happen if you got the conn from
                # self._get_conn. Close the connection before reporting.
                if conn:
                    conn.close()
                raise FullPoolError(
                    self,
                    "Pool reached maximum size and no more connections are allowed.",
                ) from None

            log.warning(
                "Connection pool is full, discarding connection: %s. Connection pool size: %s",
                self.host,
                pool.qsize(),
            )

    # Connection never got put back into the pool (pool closed, or full in
    # non-blocking mode): close it exactly once.
    if conn:
        conn.close()
def _make_request(
    self,
    conn: BaseHTTPConnection,
    method: str,
    url: str,
    body: _TYPE_BODY | None = None,
    headers: typing.Mapping[str, str] | None = None,
    retries: Retry | None = None,
    timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT,
    chunked: bool = False,
    response_conn: BaseHTTPConnection | None = None,
    preload_content: bool = True,
    decode_content: bool = True,
    enforce_content_length: bool = True,
) -> BaseHTTPResponse:
    """
    Perform a single request on a given urllib connection object taken from
    our pool and return its response. This method does not retry.

    :param conn:
        a connection from one of our connection pools

    :param method:
        HTTP request method (such as GET, POST, PUT, etc.)

    :param url:
        The URL to perform the request on.

    :param body:
        Data to send in the request body, either :class:`str`, :class:`bytes`,
        an iterable of :class:`str`/:class:`bytes`, or a file-like object.

    :param headers:
        Dictionary of custom headers to send, such as User-Agent,
        If-None-Match, etc. The value is passed to ``conn.request()``
        unchanged; no pool-level headers are merged in by this method.

    :param retries:
        Retry state for this request. It is not interpreted here; it is
        only attached to the returned response as ``response.retries``.

    :param timeout:
        If specified, overrides the default timeout for this one
        request. It may be a float (in seconds) or an instance of
        :class:`urllib3.util.Timeout`.

    :param chunked:
        If True, urllib3 will send the body using chunked transfer
        encoding. Otherwise, urllib3 will send the body using the standard
        content-length form. Defaults to False.

    :param response_conn:
        Set this to ``None`` if you will handle releasing the connection or
        set the connection to have the response release it. Stored on the
        response as ``response._connection``.

    :param preload_content:
        If True, the response's body will be preloaded during construction.

    :param decode_content:
        If True, will attempt to decode the body based on the
        'content-encoding' header.

    :param enforce_content_length:
        Enforce content length checking. Body returned by server must match
        value of Content-Length header, if present. Otherwise, raise error.

    :returns:
        The response obtained from ``conn.getresponse()``, with its
        ``retries``, ``_connection`` and ``_pool`` attributes set.

    :raises ReadTimeoutError:
        If the read timeout is zero while the connection is open, or if a
        validation/read error is identified as a timeout by
        :meth:`_raise_timeout`.
    :raises SSLError:
        If connection validation fails with a ``BaseSSLError`` or
        ``CertificateError``, unless the error is wrapped as a proxy error
        (see the comment at the wrapping site below).
    """
    self.num_requests += 1

    timeout_obj = self._get_timeout(timeout)
    timeout_obj.start_connect()
    conn.timeout = Timeout.resolve_default_timeout(timeout_obj.connect_timeout)

    try:
        # Trigger any extra validation we need to do.
        try:
            self._validate_conn(conn)
        except (SocketTimeout, BaseSSLError) as e:
            # Raises ReadTimeoutError if the error is really a timeout;
            # otherwise falls through to the bare re-raise below.
            self._raise_timeout(err=e, url=url, timeout_value=conn.timeout)
            raise

    # _validate_conn() starts the connection to an HTTPS proxy
    # so we need to wrap errors with 'ProxyError' here too.
    except (
        OSError,
        NewConnectionError,
        TimeoutError,
        BaseSSLError,
        CertificateError,
        SSLError,
    ) as e:
        new_e: Exception = e
        if isinstance(e, (BaseSSLError, CertificateError)):
            new_e = SSLError(e)
        # If the connection goes through a proxy and never finished
        # connecting to it, hand the error to _wrap_proxy_error() together
        # with the proxy's scheme so it is reported as a proxy failure.
        if isinstance(
            new_e, (OSError, NewConnectionError, TimeoutError, SSLError)
        ) and (conn and conn.proxy and not conn.has_connected_to_proxy):
            new_e = _wrap_proxy_error(new_e, conn.proxy.scheme)
        raise new_e

    # conn.request() calls http.client.*.request, not the method in
    # urllib3.request. It also calls makefile (recv) on the socket.
    try:
        conn.request(
            method,
            url,
            body=body,
            headers=headers,
            chunked=chunked,
            preload_content=preload_content,
            decode_content=decode_content,
            enforce_content_length=enforce_content_length,
        )

    # We are swallowing BrokenPipeError (errno.EPIPE) since the server is
    # legitimately able to close the connection after sending a valid response.
    # With this behaviour, the received response is still readable.
    except BrokenPipeError:
        pass
    except OSError as e:
        # MacOS/Linux
        # EPROTOTYPE is needed on macOS
        # https://erickt.github.io/blog/2014/11/19/adventures-in-debugging-a-potential-osx-kernel-bug/
        # Only EPROTOTYPE is tolerated; every other OSError propagates.
        if e.errno != errno.EPROTOTYPE:
            raise

    # Reset the timeout for the recv() on the socket
    read_timeout = timeout_obj.read_timeout

    if not conn.is_closed:
        # In Python 3 socket.py will catch EAGAIN and return None when you
        # try and read into the file pointer created by http.client, which
        # instead raises a BadStatusLine exception. Instead of catching
        # the exception and assuming all BadStatusLine exceptions are read
        # timeouts, check for a zero timeout before making the request.
        if read_timeout == 0:
            raise ReadTimeoutError(
                self, url, f"Read timed out. (read timeout={read_timeout})"
            )
        conn.timeout = read_timeout

    # Receive the response from the server
    try:
        response = conn.getresponse()
    except (BaseSSLError, OSError) as e:
        # Convert to ReadTimeoutError when applicable, else re-raise as-is.
        self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
        raise

    # Set properties that are used by the pooling layer.
    response.retries = retries
    response._connection = response_conn  # type: ignore[attr-defined]
    response._pool = self  # type: ignore[attr-defined]

    log.debug(
        '%s://%s:%s "%s %s %s" %s %s',
        self.scheme,
        self.host,
        self.port,
        method,
        url,
        # HTTP version
        conn._http_vsn_str,  # type: ignore[attr-defined]
        response.status,
        response.length_remaining,  # type: ignore[attr-defined]
    )

    return response
def urlopen(  # type: ignore[override]
    self,
    method: str,
    url: str,
    body: _TYPE_BODY | None = None,
    headers: typing.Mapping[str, str] | None = None,
    retries: Retry | bool | int | None = None,
    redirect: bool = True,
    assert_same_host: bool = True,
    timeout: _TYPE_TIMEOUT = _DEFAULT_TIMEOUT,
    pool_timeout: int | None = None,
    release_conn: bool | None = None,
    chunked: bool = False,
    body_pos: _TYPE_BODY_POSITION | None = None,
    preload_content: bool = True,
    decode_content: bool = True,
    **response_kw: typing.Any,
) -> BaseHTTPResponse:
    """
    Get a connection from the pool and perform an HTTP request. This is the
    lowest level call for making a request, so you'll need to specify all
    the raw details.

    Retries and redirects are implemented by this method calling itself
    again with the updated ``retries`` object.

    .. note::

       More commonly, it's appropriate to use a convenience method
       such as :meth:`request`.

    .. note::

       `release_conn` will only behave as expected if
       `preload_content=False` because we want to make
       `preload_content=False` the default behaviour someday soon without
       breaking backwards compatibility.

    :param method:
        HTTP request method (such as GET, POST, PUT, etc.)

    :param url:
        The URL to perform the request on.

    :param body:
        Data to send in the request body, either :class:`str`, :class:`bytes`,
        an iterable of :class:`str`/:class:`bytes`, or a file-like object.

    :param headers:
        Dictionary of custom headers to send, such as User-Agent,
        If-None-Match, etc. If None, pool headers are used. If provided,
        these headers completely replace any pool-specific headers.

    :param retries:
        Configure the number of retries to allow before raising a
        :class:`~urllib3.exceptions.MaxRetryError` exception.

        Pass ``None`` to retry until you receive a response. Pass a
        :class:`~urllib3.util.retry.Retry` object for fine-grained control
        over different types of retries.
        Pass an integer number to retry connection errors that many times,
        but no other types of errors. Pass zero to never retry.

        If ``False``, then retries are disabled and any exception is raised
        immediately. Also, instead of raising a MaxRetryError on redirects,
        the redirect response will be returned.

        Any value that is not a :class:`~urllib3.util.retry.Retry` instance
        is converted with ``Retry.from_int``, using this pool's ``retries``
        as the default.

    :type retries: :class:`~urllib3.util.retry.Retry`, False, or an int.

    :param redirect:
        If True, automatically handle redirects (status codes 301, 302,
        303, 307, 308). Each redirect counts as a retry. Disabling retries
        will disable redirect, too.

    :param assert_same_host:
        If ``True``, will make sure that the host of the pool requests is
        consistent else will raise HostChangedError. When ``False``, you can
        use the pool on an HTTP proxy and request foreign hosts.

    :param timeout:
        If specified, overrides the default timeout for this one
        request. It may be a float (in seconds) or an instance of
        :class:`urllib3.util.Timeout`.

    :param pool_timeout:
        If set and the pool is set to block=True, then this method will
        block for ``pool_timeout`` seconds and raise EmptyPoolError if no
        connection is available within the time period.

    :param bool preload_content:
        If True, the response's body will be preloaded into memory.

    :param bool decode_content:
        If True, will attempt to decode the body based on the
        'content-encoding' header.

    :param release_conn:
        If False, then the urlopen call will not release the connection
        back into the pool once a response is received (but will release if
        you read the entire contents of the response such as when
        `preload_content=True`). This is useful if you're not preloading
        the response's content immediately. You will need to call
        ``r.release_conn()`` on the response ``r`` to return the connection
        back into the pool. If None, it takes the value of ``preload_content``
        which defaults to ``True``.

    :param bool chunked:
        If True, urllib3 will send the body using chunked transfer
        encoding. Otherwise, urllib3 will send the body using the standard
        content-length form. Defaults to False.

    :param int body_pos:
        Position to seek to in file-like body in the event of a retry or
        redirect. Typically this won't need to be set because urllib3 will
        auto-populate the value when needed.

    :param \\**response_kw:
        Extra keyword arguments forwarded to :meth:`_make_request` and to
        the recursive ``urlopen`` calls made for retries and redirects.

    :returns:
        The response from :meth:`_make_request`, or from the recursive call
        that handled a retry or redirect.

    :raises HostChangedError:
        If ``assert_same_host`` is true and ``url`` is not on this pool's host.
    :raises EmptyPoolError:
        Re-raised unchanged when no connection could be obtained from the pool.
    :raises MaxRetryError:
        Propagated from ``retries.increment()`` on redirect or status retry
        when ``raise_on_redirect`` / ``raise_on_status`` is set on the retry
        object.
    """
    parsed_url = parse_url(url)
    destination_scheme = parsed_url.scheme

    if headers is None:
        headers = self.headers

    if not isinstance(retries, Retry):
        retries = Retry.from_int(retries, redirect=redirect, default=self.retries)

    if release_conn is None:
        release_conn = preload_content

    # Check host
    if assert_same_host and not self.is_same_host(url):
        raise HostChangedError(self, url, retries)

    # Ensure that the URL we're connecting to is properly encoded
    if url.startswith("/"):
        url = to_str(_encode_target(url))
    else:
        url = to_str(parsed_url.url)

    conn = None

    # Track whether `conn` needs to be released before
    # returning/raising/recursing. Update this variable if necessary, and
    # leave `release_conn` constant throughout the function. That way, if
    # the function recurses, the original value of `release_conn` will be
    # passed down into the recursive call, and its value will be respected.
    #
    # See urllib3 issue #651 for details:
    # https://github.com/urllib3/urllib3/issues/651
    release_this_conn = release_conn

    http_tunnel_required = connection_requires_http_tunnel(
        self.proxy, self.proxy_config, destination_scheme
    )

    # Merge the proxy headers. Only done when not using HTTP CONNECT. We
    # have to copy the headers dict so we can safely change it without those
    # changes being reflected in anyone else's copy.
    if not http_tunnel_required:
        headers = headers.copy()  # type: ignore[attr-defined]
        headers.update(self.proxy_headers)  # type: ignore[union-attr]

    # Must keep the exception bound to a separate variable or else Python 3
    # complains about UnboundLocalError.
    err = None

    # Keep track of whether we cleanly exited the except block. This
    # ensures we do proper cleanup in finally.
    clean_exit = False

    # Rewind body position, if needed. Record current position
    # for future rewinds in the event of a redirect/retry.
    body_pos = set_file_position(body, body_pos)

    try:
        # Request a connection from the queue.
        timeout_obj = self._get_timeout(timeout)
        conn = self._get_conn(timeout=pool_timeout)

        conn.timeout = timeout_obj.connect_timeout  # type: ignore[assignment]

        # Is this a closed/new connection that requires CONNECT tunnelling?
        if self.proxy is not None and http_tunnel_required and conn.is_closed:
            try:
                self._prepare_proxy(conn)
            except (BaseSSLError, OSError, SocketTimeout) as e:
                # Report timeouts against the proxy URL; anything that is
                # not a timeout is re-raised unchanged.
                self._raise_timeout(
                    err=e, url=self.proxy.url, timeout_value=conn.timeout
                )
                raise

        # If we're going to release the connection in ``finally:``, then
        # the response doesn't need to know about the connection. Otherwise
        # it will also try to release it and we'll have a double-release
        # mess.
        response_conn = conn if not release_conn else None

        # Make the request on the HTTPConnection object
        response = self._make_request(
            conn,
            method,
            url,
            timeout=timeout_obj,
            body=body,
            headers=headers,
            chunked=chunked,
            retries=retries,
            response_conn=response_conn,
            preload_content=preload_content,
            decode_content=decode_content,
            **response_kw,
        )

        # Everything went great!
        clean_exit = True

    except EmptyPoolError:
        # Didn't get a connection from the pool, no need to clean up
        clean_exit = True
        release_this_conn = False
        raise

    except (
        TimeoutError,
        HTTPException,
        OSError,
        ProtocolError,
        BaseSSLError,
        SSLError,
        CertificateError,
        ProxyError,
    ) as e:
        # Discard the connection for these exceptions. It will be
        # replaced during the next _get_conn() call.
        clean_exit = False
        new_e: Exception = e
        if isinstance(e, (BaseSSLError, CertificateError)):
            new_e = SSLError(e)
        # Same classification as in _make_request: an error on a proxied
        # connection that never reached its proxy is wrapped as a proxy
        # error; other OSError/HTTPException become ProtocolError.
        if isinstance(
            new_e,
            (
                OSError,
                NewConnectionError,
                TimeoutError,
                SSLError,
                HTTPException,
            ),
        ) and (conn and conn.proxy and not conn.has_connected_to_proxy):
            new_e = _wrap_proxy_error(new_e, conn.proxy.scheme)
        elif isinstance(new_e, (OSError, HTTPException)):
            new_e = ProtocolError("Connection aborted.", new_e)

        # The new Retry object replaces the old one for the recursive call
        # issued below.
        retries = retries.increment(
            method, url, error=new_e, _pool=self, _stacktrace=sys.exc_info()[2]
        )
        retries.sleep()

        # Keep track of the error for the retry warning.
        err = e

    finally:
        if not clean_exit:
            # We hit some kind of exception, handled or otherwise. We need
            # to throw the connection away unless explicitly told not to.
            # Close the connection, set the variable to None, and make sure
            # we put the None back in the pool to avoid leaking it.
            if conn:
                conn.close()
                conn = None
            release_this_conn = True

        if release_this_conn:
            # Put the connection back to be reused. If the connection is
            # expired then it will be None, which will get replaced with a
            # fresh connection during _get_conn.
            self._put_conn(conn)

    if not conn:
        # Try again. `conn` is None here only because the handled-error path
        # above discarded it.
        log.warning(
            "Retrying (%r) after connection broken by '%r': %s", retries, err, url
        )
        return self.urlopen(
            method,
            url,
            body,
            headers,
            retries,
            redirect,
            assert_same_host,
            timeout=timeout,
            pool_timeout=pool_timeout,
            release_conn=release_conn,
            chunked=chunked,
            body_pos=body_pos,
            preload_content=preload_content,
            decode_content=decode_content,
            **response_kw,
        )

    # Handle redirect?
    redirect_location = redirect and response.get_redirect_location()
    if redirect_location:
        if response.status == 303:
            # Change the method according to RFC 9110, Section 15.4.4.
            method = "GET"
            # And lose the body not to transfer anything sensitive.
            body = None
            headers = HTTPHeaderDict(headers)._prepare_for_method_change()

        try:
            retries = retries.increment(method, url, response=response, _pool=self)
        except MaxRetryError:
            # Out of redirects: either surface the error or hand back the
            # redirect response itself, depending on the Retry settings.
            if retries.raise_on_redirect:
                response.drain_conn()
                raise
            return response

        response.drain_conn()
        retries.sleep_for_retry(response)
        log.debug("Redirecting %s -> %s", url, redirect_location)
        return self.urlopen(
            method,
            redirect_location,
            body,
            headers,
            retries=retries,
            redirect=redirect,
            assert_same_host=assert_same_host,
            timeout=timeout,
            pool_timeout=pool_timeout,
            release_conn=release_conn,
            chunked=chunked,
            body_pos=body_pos,
            preload_content=preload_content,
            decode_content=decode_content,
            **response_kw,
        )

    # Check if we should retry the HTTP response.
    has_retry_after = bool(response.headers.get("Retry-After"))
    if retries.is_retry(method, response.status, has_retry_after):
        try:
            retries = retries.increment(method, url, response=response, _pool=self)
        except MaxRetryError:
            # Out of status retries: raise or return the last response,
            # depending on the Retry settings.
            if retries.raise_on_status:
                response.drain_conn()
                raise
            return response

        response.drain_conn()
        retries.sleep(response)
        log.debug("Retry: %s", url)
        return self.urlopen(
            method,
            url,
            body,
            headers,
            retries=retries,
            redirect=redirect,
            assert_same_host=assert_same_host,
            timeout=timeout,
            pool_timeout=pool_timeout,
            release_conn=release_conn,
            chunked=chunked,
            body_pos=body_pos,
            preload_content=preload_content,
            decode_content=decode_content,
            **response_kw,
        )

    return response
def _prepare_proxy(self, conn: HTTPSConnection) -> None:  # type: ignore[override]
    """
    Establish a tunnel through the proxy using HTTP CONNECT.

    The tunnel scheme is "https" only when the pool's proxy has the "https"
    scheme; in every other case it is "http". The tunnel is configured on
    ``conn`` and the connection is then opened.

    :param conn: Connection on which the tunnel is set up and connected.
    """
    proxy = self.proxy
    proxy_speaks_tls = bool(proxy) and proxy.scheme == "https"

    tunnel_options = {
        "scheme": "https" if proxy_speaks_tls else "http",
        "host": self._tunnel_host,
        "port": self.port,
        "headers": self.proxy_headers,
    }
    conn.set_tunnel(**tunnel_options)
    conn.connect()
def _validate_conn(self, conn: BaseHTTPConnection) -> None:
    """
    Hook invoked right before a request is made, after the socket is created.

    Runs the parent hook, opens ``conn`` if it is still closed, and emits an
    :class:`InsecureRequestWarning` when the connection reports that it is
    not verified.

    :param conn: Connection about to be used for a request.
    """
    super()._validate_conn(conn)

    # Force connect early to allow us to validate the connection.
    if conn.is_closed:
        conn.connect()

    if conn.is_verified:
        return

    unverified_notice = (
        f"Unverified HTTPS request is being made to host '{conn.host}'. "
        "Adding certificate verification is strongly advised. See: "
        "https://urllib3.readthedocs.io/en/latest/advanced-usage.html"
        "#tls-warnings"
    )
    warnings.warn(unverified_notice, InsecureRequestWarning)
@typing.overload
def _normalize_host(host: None, scheme: str | None) -> None:
    ...


@typing.overload
def _normalize_host(host: str, scheme: str | None) -> str:
    ...


def _normalize_host(host: str | None, scheme: str | None) -> str | None:
    """
    Normalize hosts for comparisons and use with sockets.

    The host is first passed through ``normalize_host()``; if the result is
    wrapped in square brackets (IPv6 literal form), the brackets are removed.

    :param host: Host to normalize, or ``None``.
    :param scheme: Scheme forwarded to ``normalize_host()``.
    :returns: The normalized host without surrounding brackets; a falsy
        result from ``normalize_host()`` is returned unchanged.
    """
    normalized = normalize_host(host, scheme)
    if not normalized:
        return normalized

    # httplib doesn't like it when we include brackets in IPv6 addresses
    # Specifically, if we include brackets but also pass the port then
    # httplib crazily doubles up the square brackets on the Host header.
    # Instead, we need to make sure we never pass ``None`` as the port.
    # However, for backward compatibility reasons we can't actually
    # *assert* that. See http://bugs.python.org/issue28539
    is_bracketed = normalized.startswith("[") and normalized.endswith("]")
    return normalized[1:-1] if is_bracketed else normalized
def _close_pool_connections(pool: queue.LifoQueue[typing.Any]) -> None:
    """
    Empty ``pool`` and close every connection found in it.

    Items are fetched without blocking until the queue reports that it is
    empty. Falsy entries (such as the ``None`` placeholders a pool is filled
    with) are removed but not closed.

    :param pool: Queue of pooled connections to drain.
    """
    try:
        while True:
            pooled = pool.get(block=False)
            if not pooled:
                continue
            pooled.close()
    except queue.Empty:
        return  # Done.
+ return self.__class__, (None, self.url, None) + + +class SSLError(HTTPError): + """Raised when SSL certificate fails in an HTTPS connection.""" + + +class ProxyError(HTTPError): + """Raised when the connection to a proxy fails.""" + + # The original error is also available as __cause__. + original_error: Exception + + def __init__(self, message: str, error: Exception) -> None: + super().__init__(message, error) + self.original_error = error + + +class DecodeError(HTTPError): + """Raised when automatic decoding based on Content-Type fails.""" + + +class ProtocolError(HTTPError): + """Raised when something unexpected happens mid-request/response.""" + + +#: Renamed to ProtocolError but aliased for backwards compatibility. +ConnectionError = ProtocolError + + +# Leaf Exceptions + + +class MaxRetryError(RequestError): + """Raised when the maximum number of retries is exceeded. + + :param pool: The connection pool + :type pool: :class:`~urllib3.connectionpool.HTTPConnectionPool` + :param str url: The requested Url + :param reason: The underlying error + :type reason: :class:`Exception` + + """ + + def __init__( + self, pool: ConnectionPool, url: str, reason: Exception | None = None + ) -> None: + self.reason = reason + + message = f"Max retries exceeded with url: {url} (Caused by {reason!r})" + + super().__init__(pool, url, message) + + +class HostChangedError(RequestError): + """Raised when an existing pool gets a request for a foreign host.""" + + def __init__( + self, pool: ConnectionPool, url: str, retries: Retry | int = 3 + ) -> None: + message = f"Tried to open a foreign host with url: {url}" + super().__init__(pool, url, message) + self.retries = retries + + +class TimeoutStateError(HTTPError): + """Raised when passing an invalid state to a timeout""" + + +class TimeoutError(HTTPError): + """Raised when a socket timeout error occurs. + + Catching this error will catch both :exc:`ReadTimeoutErrors + <ReadTimeoutError>` and :exc:`ConnectTimeoutErrors <ConnectTimeoutError>`.
+ """ + + +class ReadTimeoutError(TimeoutError, RequestError): + """Raised when a socket timeout occurs while receiving data from a server""" + + +# This timeout error does not have a URL attached and needs to inherit from the +# base HTTPError +class ConnectTimeoutError(TimeoutError): + """Raised when a socket timeout occurs while connecting to a server""" + + +class NewConnectionError(ConnectTimeoutError, HTTPError): + """Raised when we fail to establish a new connection. Usually ECONNREFUSED.""" + + def __init__(self, conn: HTTPConnection, message: str) -> None: + self.conn = conn + super().__init__(f"{conn}: {message}") + + @property + def pool(self) -> HTTPConnection: + warnings.warn( + "The 'pool' property is deprecated and will be removed " + "in urllib3 v2.1.0. Use 'conn' instead.", + DeprecationWarning, + stacklevel=2, + ) + + return self.conn + + +class NameResolutionError(NewConnectionError): + """Raised when host name resolution fails.""" + + def __init__(self, host: str, conn: HTTPConnection, reason: socket.gaierror): + message = f"Failed to resolve '{host}' ({reason})" + super().__init__(conn, message) + + +class EmptyPoolError(PoolError): + """Raised when a pool runs out of connections and no more are allowed.""" + + +class FullPoolError(PoolError): + """Raised when we try to add a connection to a full pool in blocking mode.""" + + +class ClosedPoolError(PoolError): + """Raised when a request enters a pool after the pool has been closed.""" + + +class LocationValueError(ValueError, HTTPError): + """Raised when there is something wrong with a given URL input.""" + + +class LocationParseError(LocationValueError): + """Raised when get_host or similar fails to parse the URL input.""" + + def __init__(self, location: str) -> None: + message = f"Failed to parse: {location}" + super().__init__(message) + + self.location = location + + +class URLSchemeUnknown(LocationValueError): + """Raised when a URL input has an unsupported scheme.""" + + def 
__init__(self, scheme: str): + message = f"Not supported URL scheme {scheme}" + super().__init__(message) + + self.scheme = scheme + + +class ResponseError(HTTPError): + """Used as a container for an error reason supplied in a MaxRetryError.""" + + GENERIC_ERROR = "too many error responses" + SPECIFIC_ERROR = "too many {status_code} error responses" + + +class SecurityWarning(HTTPWarning): + """Warned when performing security reducing actions""" + + +class InsecureRequestWarning(SecurityWarning): + """Warned when making an unverified HTTPS request.""" + + +class NotOpenSSLWarning(SecurityWarning): + """Warned when using unsupported SSL library""" + + +class SystemTimeWarning(SecurityWarning): + """Warned when system time is suspected to be wrong""" + + +class InsecurePlatformWarning(SecurityWarning): + """Warned when certain TLS/SSL configuration is not available on a platform.""" + + +class DependencyWarning(HTTPWarning): + """ + Warned when an attempt is made to import a module with missing optional + dependencies. + """ + + +class ResponseNotChunked(ProtocolError, ValueError): + """Response needs to be chunked in order to read it as chunks.""" + + +class BodyNotHttplibCompatible(HTTPError): + """ + Body should be :class:`http.client.HTTPResponse` like + (have an fp attribute which returns raw chunks) for read_chunked(). + """ + + +class IncompleteRead(HTTPError, httplib_IncompleteRead): + """ + Response length doesn't match expected Content-Length + + Subclass of :class:`http.client.IncompleteRead` to allow int value + for ``partial`` to avoid creating large objects on streamed reads. 
+ """ + + def __init__(self, partial: int, expected: int) -> None: + self.partial = partial # type: ignore[assignment] + self.expected = expected + + def __repr__(self) -> str: + return "IncompleteRead(%i bytes read, %i more expected)" % ( + self.partial, # type: ignore[str-format] + self.expected, + ) + + +class InvalidChunkLength(HTTPError, httplib_IncompleteRead): + """Invalid chunk length in a chunked response.""" + + def __init__(self, response: HTTPResponse, length: bytes) -> None: + self.partial: int = response.tell() # type: ignore[assignment] + self.expected: int | None = response.length_remaining + self.response = response + self.length = length + + def __repr__(self) -> str: + return "InvalidChunkLength(got length %r, %i bytes read)" % ( + self.length, + self.partial, + ) + + +class InvalidHeader(HTTPError): + """The header provided was somehow invalid.""" + + +class ProxySchemeUnknown(AssertionError, URLSchemeUnknown): + """ProxyManager does not support the supplied scheme""" + + # TODO(t-8ch): Stop inheriting from AssertionError in v2.0. + + def __init__(self, scheme: str | None) -> None: + # 'localhost' is here because our URL parser parses + # localhost:8080 -> scheme=localhost, remove if we fix this. 
+ if scheme == "localhost": + scheme = None + if scheme is None: + message = "Proxy URL had no scheme, should start with http:// or https://" + else: + message = f"Proxy URL had unsupported scheme {scheme}, should use http:// or https://" + super().__init__(message) + + +class ProxySchemeUnsupported(ValueError): + """Fetching HTTPS resources through HTTPS proxies is unsupported""" + + +class HeaderParsingError(HTTPError): + """Raised by assert_header_parsing, but we convert it to a log.warning statement.""" + + def __init__( + self, defects: list[MessageDefect], unparsed_data: bytes | str | None + ) -> None: + message = f"{defects or 'Unknown'}, unparsed data: {unparsed_data!r}" + super().__init__(message) + + +class UnrewindableBodyError(HTTPError): + """urllib3 encountered an error when trying to rewind a body""" diff --git a/Modules/urllib3/fields.py b/Modules/urllib3/fields.py new file mode 100644 index 0000000..51d898e --- /dev/null +++ b/Modules/urllib3/fields.py @@ -0,0 +1,345 @@ +from __future__ import annotations + +import email.utils +import mimetypes +import typing + +_TYPE_FIELD_VALUE = typing.Union[str, bytes] +_TYPE_FIELD_VALUE_TUPLE = typing.Union[ + _TYPE_FIELD_VALUE, + typing.Tuple[str, _TYPE_FIELD_VALUE], + typing.Tuple[str, _TYPE_FIELD_VALUE, str], +] + + +def guess_content_type( + filename: str | None, default: str = "application/octet-stream" +) -> str: + """ + Guess the "Content-Type" of a file. + + :param filename: + The filename to guess the "Content-Type" of using :mod:`mimetypes`. + :param default: + If no "Content-Type" can be guessed, default to `default`. + """ + if filename: + return mimetypes.guess_type(filename)[0] or default + return default + + +def format_header_param_rfc2231(name: str, value: _TYPE_FIELD_VALUE) -> str: + """ + Helper function to format and quote a single header parameter using the + strategy defined in RFC 2231. + + Particularly useful for header parameters which might contain + non-ASCII values, like file names. 
This follows + `RFC 2388 Section 4.4 <https://tools.ietf.org/html/rfc2388#section-4.4>`_. + + :param name: + The name of the parameter, a string expected to be ASCII only. + :param value: + The value of the parameter, provided as ``bytes`` or ``str``. + :returns: + An RFC-2231-formatted unicode string. + + .. deprecated:: 2.0.0 + Will be removed in urllib3 v2.1.0. This is not valid for + ``multipart/form-data`` header parameters. + """ + import warnings + + warnings.warn( + "'format_header_param_rfc2231' is deprecated and will be " + "removed in urllib3 v2.1.0. This is not valid for " + "multipart/form-data header parameters.", + DeprecationWarning, + stacklevel=2, + ) + + if isinstance(value, bytes): + value = value.decode("utf-8") + + if not any(ch in value for ch in '"\\\r\n'): + result = f'{name}="{value}"' + try: + result.encode("ascii") + except (UnicodeEncodeError, UnicodeDecodeError): + pass + else: + return result + + value = email.utils.encode_rfc2231(value, "utf-8") + value = f"{name}*={value}" + + return value + + +def format_multipart_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str: + """ + Format and quote a single multipart header parameter. + + This follows the `WHATWG HTML Standard`_ as of 2021/06/10, matching + the behavior of current browser and curl versions. Values are + assumed to be UTF-8. The ``\\n``, ``\\r``, and ``"`` characters are + percent encoded. + + .. _WHATWG HTML Standard: + https://html.spec.whatwg.org/multipage/ + form-control-infrastructure.html#multipart-form-data + + :param name: + The name of the parameter, an ASCII-only ``str``. + :param value: + The value of the parameter, a ``str`` or UTF-8 encoded + ``bytes``. + :returns: + A string ``name="value"`` with the escaped value. + + .. versionchanged:: 2.0.0 + Matches the WHATWG HTML Standard as of 2021/06/10. Control + characters are no longer percent encoded. + + .. versionchanged:: 2.0.0 + Renamed from ``format_header_param_html5`` and + ``format_header_param``. The old names will be removed in + urllib3 v2.1.0.
+ """ + if isinstance(value, bytes): + value = value.decode("utf-8") + + # percent encode \n \r " + value = value.translate({10: "%0A", 13: "%0D", 34: "%22"}) + return f'{name}="{value}"' + + +def format_header_param_html5(name: str, value: _TYPE_FIELD_VALUE) -> str: + """ + .. deprecated:: 2.0.0 + Renamed to :func:`format_multipart_header_param`. Will be + removed in urllib3 v2.1.0. + """ + import warnings + + warnings.warn( + "'format_header_param_html5' has been renamed to " + "'format_multipart_header_param'. The old name will be " + "removed in urllib3 v2.1.0.", + DeprecationWarning, + stacklevel=2, + ) + return format_multipart_header_param(name, value) + + +def format_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str: + """ + .. deprecated:: 2.0.0 + Renamed to :func:`format_multipart_header_param`. Will be + removed in urllib3 v2.1.0. + """ + import warnings + + warnings.warn( + "'format_header_param' has been renamed to " + "'format_multipart_header_param'. The old name will be " + "removed in urllib3 v2.1.0.", + DeprecationWarning, + stacklevel=2, + ) + return format_multipart_header_param(name, value) + + +class RequestField: + """ + A data container for request body parameters. + + :param name: + The name of this request field. Must be unicode. + :param data: + The data/value body. + :param filename: + An optional filename of the request field. Must be unicode. + :param headers: + An optional dict-like object of headers to initially use for the field. + + .. versionchanged:: 2.0.0 + The ``header_formatter`` parameter is deprecated and will + be removed in urllib3 v2.1.0. 
+ """ + + def __init__( + self, + name: str, + data: _TYPE_FIELD_VALUE, + filename: str | None = None, + headers: typing.Mapping[str, str] | None = None, + header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None, + ): + self._name = name + self._filename = filename + self.data = data + self.headers: dict[str, str | None] = {} + if headers: + self.headers = dict(headers) + + if header_formatter is not None: + import warnings + + warnings.warn( + "The 'header_formatter' parameter is deprecated and " + "will be removed in urllib3 v2.1.0.", + DeprecationWarning, + stacklevel=2, + ) + self.header_formatter = header_formatter + else: + self.header_formatter = format_multipart_header_param + + @classmethod + def from_tuples( + cls, + fieldname: str, + value: _TYPE_FIELD_VALUE_TUPLE, + header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None, + ) -> RequestField: + """ + A :class:`~urllib3.fields.RequestField` factory from old-style tuple parameters. + + Supports constructing :class:`~urllib3.fields.RequestField` from + parameter of key/value strings AND key/filetuple. A filetuple is a + (filename, data, MIME type) tuple where the MIME type is optional. + For example:: + + 'foo': 'bar', + 'fakefile': ('foofile.txt', 'contents of foofile'), + 'realfile': ('barfile.txt', open('realfile').read()), + 'typedfile': ('bazfile.bin', open('bazfile').read(), 'image/jpeg'), + 'nonamefile': 'contents of nonamefile field', + + Field names and filenames must be unicode. 
+ """ + filename: str | None + content_type: str | None + data: _TYPE_FIELD_VALUE + + if isinstance(value, tuple): + if len(value) == 3: + filename, data, content_type = typing.cast( + typing.Tuple[str, _TYPE_FIELD_VALUE, str], value + ) + else: + filename, data = typing.cast( + typing.Tuple[str, _TYPE_FIELD_VALUE], value + ) + content_type = guess_content_type(filename) + else: + filename = None + content_type = None + data = value + + request_param = cls( + fieldname, data, filename=filename, header_formatter=header_formatter + ) + request_param.make_multipart(content_type=content_type) + + return request_param + + def _render_part(self, name: str, value: _TYPE_FIELD_VALUE) -> str: + """ + Override this method to change how each multipart header + parameter is formatted. By default, this calls + :func:`format_multipart_header_param`. + + :param name: + The name of the parameter, an ASCII-only ``str``. + :param value: + The value of the parameter, a ``str`` or UTF-8 encoded + ``bytes``. + + :meta public: + """ + return self.header_formatter(name, value) + + def _render_parts( + self, + header_parts: ( + dict[str, _TYPE_FIELD_VALUE | None] + | typing.Sequence[tuple[str, _TYPE_FIELD_VALUE | None]] + ), + ) -> str: + """ + Helper function to format and quote a single header. + + Useful for single headers that are composed of multiple items. E.g., + 'Content-Disposition' fields. + + :param header_parts: + A sequence of (k, v) tuples or a :class:`dict` of (k, v) to format + as `k1="v1"; k2="v2"; ...`. + """ + iterable: typing.Iterable[tuple[str, _TYPE_FIELD_VALUE | None]] + + parts = [] + if isinstance(header_parts, dict): + iterable = header_parts.items() + else: + iterable = header_parts + + for name, value in iterable: + if value is not None: + parts.append(self._render_part(name, value)) + + return "; ".join(parts) + + def render_headers(self) -> str: + """ + Renders the headers for this request field. 
+ """ + lines = [] + + sort_keys = ["Content-Disposition", "Content-Type", "Content-Location"] + for sort_key in sort_keys: + if self.headers.get(sort_key, False): + lines.append(f"{sort_key}: {self.headers[sort_key]}") + + for header_name, header_value in self.headers.items(): + if header_name not in sort_keys: + if header_value: + lines.append(f"{header_name}: {header_value}") + + lines.append("\r\n") + return "\r\n".join(lines) + + def make_multipart( + self, + content_disposition: str | None = None, + content_type: str | None = None, + content_location: str | None = None, + ) -> None: + """ + Makes this request field into a multipart request field. + + This method overrides "Content-Disposition", "Content-Type" and + "Content-Location" headers to the request parameter. + + :param content_disposition: + The 'Content-Disposition' of the request body. Defaults to 'form-data' + :param content_type: + The 'Content-Type' of the request body. + :param content_location: + The 'Content-Location' of the request body. 
+ + """ + content_disposition = (content_disposition or "form-data") + "; ".join( + [ + "", + self._render_parts( + (("name", self._name), ("filename", self._filename)) + ), + ] + ) + + self.headers["Content-Disposition"] = content_disposition + self.headers["Content-Type"] = content_type + self.headers["Content-Location"] = content_location diff --git a/Modules/urllib3/filepost.py b/Modules/urllib3/filepost.py new file mode 100644 index 0000000..1c90a21 --- /dev/null +++ b/Modules/urllib3/filepost.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import binascii +import codecs +import os +import typing +from io import BytesIO + +from .fields import _TYPE_FIELD_VALUE_TUPLE, RequestField + +writer = codecs.lookup("utf-8")[3] + +_TYPE_FIELDS_SEQUENCE = typing.Sequence[ + typing.Union[typing.Tuple[str, _TYPE_FIELD_VALUE_TUPLE], RequestField] +] +_TYPE_FIELDS = typing.Union[ + _TYPE_FIELDS_SEQUENCE, + typing.Mapping[str, _TYPE_FIELD_VALUE_TUPLE], +] + + +def choose_boundary() -> str: + """ + Our embarrassingly-simple replacement for mimetools.choose_boundary. + """ + return binascii.hexlify(os.urandom(16)).decode() + + +def iter_field_objects(fields: _TYPE_FIELDS) -> typing.Iterable[RequestField]: + """ + Iterate over fields. + + Supports list of (k, v) tuples and dicts, and lists of + :class:`~urllib3.fields.RequestField`. + + """ + iterable: typing.Iterable[RequestField | tuple[str, _TYPE_FIELD_VALUE_TUPLE]] + + if isinstance(fields, typing.Mapping): + iterable = fields.items() + else: + iterable = fields + + for field in iterable: + if isinstance(field, RequestField): + yield field + else: + yield RequestField.from_tuples(*field) + + +def encode_multipart_formdata( + fields: _TYPE_FIELDS, boundary: str | None = None +) -> tuple[bytes, str]: + """ + Encode a dictionary of ``fields`` using the multipart/form-data MIME format. + + :param fields: + Dictionary of fields or list of (key, :class:`~urllib3.fields.RequestField`). 
+ Values are processed by :func:`urllib3.fields.RequestField.from_tuples`. + + :param boundary: + If not specified, then a random boundary will be generated using + :func:`urllib3.filepost.choose_boundary`. + """ + body = BytesIO() + if boundary is None: + boundary = choose_boundary() + + for field in iter_field_objects(fields): + body.write(f"--{boundary}\r\n".encode("latin-1")) + + writer(body).write(field.render_headers()) + data = field.data + + if isinstance(data, int): + data = str(data) # Backwards compatibility + + if isinstance(data, str): + writer(body).write(data) + else: + body.write(data) + + body.write(b"\r\n") + + body.write(f"--{boundary}--\r\n".encode("latin-1")) + + content_type = f"multipart/form-data; boundary={boundary}" + + return body.getvalue(), content_type diff --git a/Modules/urllib3/poolmanager.py b/Modules/urllib3/poolmanager.py new file mode 100644 index 0000000..32da0a0 --- /dev/null +++ b/Modules/urllib3/poolmanager.py @@ -0,0 +1,638 @@ +from __future__ import annotations + +import functools +import logging +import typing +import warnings +from types import TracebackType +from urllib.parse import urljoin + +from ._collections import HTTPHeaderDict, RecentlyUsedContainer +from ._request_methods import RequestMethods +from .connection import ProxyConfig +from .connectionpool import HTTPConnectionPool, HTTPSConnectionPool, port_by_scheme +from .exceptions import ( + LocationValueError, + MaxRetryError, + ProxySchemeUnknown, + URLSchemeUnknown, +) +from .response import BaseHTTPResponse +from .util.connection import _TYPE_SOCKET_OPTIONS +from .util.proxy import connection_requires_http_tunnel +from .util.retry import Retry +from .util.timeout import Timeout +from .util.url import Url, parse_url + +if typing.TYPE_CHECKING: + import ssl + from typing import Literal + +__all__ = ["PoolManager", "ProxyManager", "proxy_from_url"] + + +log = logging.getLogger(__name__) + +SSL_KEYWORDS = ( + "key_file", + "cert_file", + "cert_reqs", + 
"ca_certs", + "ca_cert_data", + "ssl_version", + "ssl_minimum_version", + "ssl_maximum_version", + "ca_cert_dir", + "ssl_context", + "key_password", + "server_hostname", +) +# Default value for `blocksize` - a new parameter introduced to +# http.client.HTTPConnection & http.client.HTTPSConnection in Python 3.7 +_DEFAULT_BLOCKSIZE = 16384 + +_SelfT = typing.TypeVar("_SelfT") + + +class PoolKey(typing.NamedTuple): + """ + All known keyword arguments that could be provided to the pool manager, its + pools, or the underlying connections. + + All custom key schemes should include the fields in this key at a minimum. + """ + + key_scheme: str + key_host: str + key_port: int | None + key_timeout: Timeout | float | int | None + key_retries: Retry | bool | int | None + key_block: bool | None + key_source_address: tuple[str, int] | None + key_key_file: str | None + key_key_password: str | None + key_cert_file: str | None + key_cert_reqs: str | None + key_ca_certs: str | None + key_ca_cert_data: str | bytes | None + key_ssl_version: int | str | None + key_ssl_minimum_version: ssl.TLSVersion | None + key_ssl_maximum_version: ssl.TLSVersion | None + key_ca_cert_dir: str | None + key_ssl_context: ssl.SSLContext | None + key_maxsize: int | None + key_headers: frozenset[tuple[str, str]] | None + key__proxy: Url | None + key__proxy_headers: frozenset[tuple[str, str]] | None + key__proxy_config: ProxyConfig | None + key_socket_options: _TYPE_SOCKET_OPTIONS | None + key__socks_options: frozenset[tuple[str, str]] | None + key_assert_hostname: bool | str | None + key_assert_fingerprint: str | None + key_server_hostname: str | None + key_blocksize: int | None + + +def _default_key_normalizer( + key_class: type[PoolKey], request_context: dict[str, typing.Any] +) -> PoolKey: + """ + Create a pool key out of a request context dictionary. + + According to RFC 3986, both the scheme and host are case-insensitive. 
+ Therefore, this function normalizes both before constructing the pool + key for an HTTPS request. If you wish to change this behaviour, provide + alternate callables to ``key_fn_by_scheme``. + + :param key_class: + The class to use when constructing the key. This should be a namedtuple + with the ``scheme`` and ``host`` keys at a minimum. + :type key_class: namedtuple + :param request_context: + A dictionary-like object that contain the context for a request. + :type request_context: dict + + :return: A namedtuple that can be used as a connection pool key. + :rtype: PoolKey + """ + # Since we mutate the dictionary, make a copy first + context = request_context.copy() + context["scheme"] = context["scheme"].lower() + context["host"] = context["host"].lower() + + # These are both dictionaries and need to be transformed into frozensets + for key in ("headers", "_proxy_headers", "_socks_options"): + if key in context and context[key] is not None: + context[key] = frozenset(context[key].items()) + + # The socket_options key may be a list and needs to be transformed into a + # tuple. + socket_opts = context.get("socket_options") + if socket_opts is not None: + context["socket_options"] = tuple(socket_opts) + + # Map the kwargs to the names in the namedtuple - this is necessary since + # namedtuples can't have fields starting with '_'. + for key in list(context.keys()): + context["key_" + key] = context.pop(key) + + # Default to ``None`` for keys missing from the context + for field in key_class._fields: + if field not in context: + context[field] = None + + # Default key_blocksize to _DEFAULT_BLOCKSIZE if missing from the context + if context.get("key_blocksize") is None: + context["key_blocksize"] = _DEFAULT_BLOCKSIZE + + return key_class(**context) + + +#: A dictionary that maps a scheme to a callable that creates a pool key. +#: This can be used to alter the way pool keys are constructed, if desired. 
+#: Each PoolManager makes a copy of this dictionary so they can be configured +#: globally here, or individually on the instance. +key_fn_by_scheme = { + "http": functools.partial(_default_key_normalizer, PoolKey), + "https": functools.partial(_default_key_normalizer, PoolKey), +} + +pool_classes_by_scheme = {"http": HTTPConnectionPool, "https": HTTPSConnectionPool} + + +class PoolManager(RequestMethods): + """ + Allows for arbitrary requests while transparently keeping track of + necessary connection pools for you. + + :param num_pools: + Number of connection pools to cache before discarding the least + recently used pool. + + :param headers: + Headers to include with all requests, unless other headers are given + explicitly. + + :param \\**connection_pool_kw: + Additional parameters are used to create fresh + :class:`urllib3.connectionpool.ConnectionPool` instances. + + Example: + + .. code-block:: python + + import urllib3 + + http = urllib3.PoolManager(num_pools=2) + + resp1 = http.request("GET", "https://google.com/") + resp2 = http.request("GET", "https://google.com/mail") + resp3 = http.request("GET", "https://yahoo.com/") + + print(len(http.pools)) + # 2 + + """ + + proxy: Url | None = None + proxy_config: ProxyConfig | None = None + + def __init__( + self, + num_pools: int = 10, + headers: typing.Mapping[str, str] | None = None, + **connection_pool_kw: typing.Any, + ) -> None: + super().__init__(headers) + self.connection_pool_kw = connection_pool_kw + + self.pools: RecentlyUsedContainer[PoolKey, HTTPConnectionPool] + self.pools = RecentlyUsedContainer(num_pools) + + # Locally set the pool classes and keys so other PoolManagers can + # override them. 
+ self.pool_classes_by_scheme = pool_classes_by_scheme + self.key_fn_by_scheme = key_fn_by_scheme.copy() + + def __enter__(self: _SelfT) -> _SelfT: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> Literal[False]: + self.clear() + # Return False to re-raise any potential exceptions + return False + + def _new_pool( + self, + scheme: str, + host: str, + port: int, + request_context: dict[str, typing.Any] | None = None, + ) -> HTTPConnectionPool: + """ + Create a new :class:`urllib3.connectionpool.ConnectionPool` based on host, port, scheme, and + any additional pool keyword arguments. + + If ``request_context`` is provided, it is provided as keyword arguments + to the pool class used. This method is used to actually create the + connection pools handed out by :meth:`connection_from_url` and + companion methods. It is intended to be overridden for customization. + """ + pool_cls: type[HTTPConnectionPool] = self.pool_classes_by_scheme[scheme] + if request_context is None: + request_context = self.connection_pool_kw.copy() + + # Default blocksize to _DEFAULT_BLOCKSIZE if missing or explicitly + # set to 'None' in the request_context. + if request_context.get("blocksize") is None: + request_context["blocksize"] = _DEFAULT_BLOCKSIZE + + # Although the context has everything necessary to create the pool, + # this function has historically only used the scheme, host, and port + # in the positional args. When an API change is acceptable these can + # be removed. + for key in ("scheme", "host", "port"): + request_context.pop(key, None) + + if scheme == "http": + for kw in SSL_KEYWORDS: + request_context.pop(kw, None) + + return pool_cls(host, port, **request_context) + + def clear(self) -> None: + """ + Empty our store of pools and direct them all to close. + + This will not affect in-flight connections, but they will not be + re-used after completion. 
+ """ + self.pools.clear() + + def connection_from_host( + self, + host: str | None, + port: int | None = None, + scheme: str | None = "http", + pool_kwargs: dict[str, typing.Any] | None = None, + ) -> HTTPConnectionPool: + """ + Get a :class:`urllib3.connectionpool.ConnectionPool` based on the host, port, and scheme. + + If ``port`` isn't given, it will be derived from the ``scheme`` using + ``urllib3.connectionpool.port_by_scheme``. If ``pool_kwargs`` is + provided, it is merged with the instance's ``connection_pool_kw`` + variable and used to create the new connection pool, if one is + needed. + """ + + if not host: + raise LocationValueError("No host specified.") + + request_context = self._merge_pool_kwargs(pool_kwargs) + request_context["scheme"] = scheme or "http" + if not port: + port = port_by_scheme.get(request_context["scheme"].lower(), 80) + request_context["port"] = port + request_context["host"] = host + + return self.connection_from_context(request_context) + + def connection_from_context( + self, request_context: dict[str, typing.Any] + ) -> HTTPConnectionPool: + """ + Get a :class:`urllib3.connectionpool.ConnectionPool` based on the request context. + + ``request_context`` must at least contain the ``scheme`` key and its + value must be a key in ``key_fn_by_scheme`` instance variable. + """ + if "strict" in request_context: + warnings.warn( + "The 'strict' parameter is no longer needed on Python 3+. 
" + "This will raise an error in urllib3 v2.1.0.", + DeprecationWarning, + ) + request_context.pop("strict") + + scheme = request_context["scheme"].lower() + pool_key_constructor = self.key_fn_by_scheme.get(scheme) + if not pool_key_constructor: + raise URLSchemeUnknown(scheme) + pool_key = pool_key_constructor(request_context) + + return self.connection_from_pool_key(pool_key, request_context=request_context) + + def connection_from_pool_key( + self, pool_key: PoolKey, request_context: dict[str, typing.Any] + ) -> HTTPConnectionPool: + """ + Get a :class:`urllib3.connectionpool.ConnectionPool` based on the provided pool key. + + ``pool_key`` should be a namedtuple that only contains immutable + objects. At a minimum it must have the ``scheme``, ``host``, and + ``port`` fields. + """ + with self.pools.lock: + # If the scheme, host, or port doesn't match existing open + # connections, open a new ConnectionPool. + pool = self.pools.get(pool_key) + if pool: + return pool + + # Make a fresh ConnectionPool of the desired type + scheme = request_context["scheme"] + host = request_context["host"] + port = request_context["port"] + pool = self._new_pool(scheme, host, port, request_context=request_context) + self.pools[pool_key] = pool + + return pool + + def connection_from_url( + self, url: str, pool_kwargs: dict[str, typing.Any] | None = None + ) -> HTTPConnectionPool: + """ + Similar to :func:`urllib3.connectionpool.connection_from_url`. + + If ``pool_kwargs`` is not provided and a new pool needs to be + constructed, ``self.connection_pool_kw`` is used to initialize + the :class:`urllib3.connectionpool.ConnectionPool`. If ``pool_kwargs`` + is provided, it is used instead. Note that if a new pool does not + need to be created for the request, the provided ``pool_kwargs`` are + not used. 
+ """ + u = parse_url(url) + return self.connection_from_host( + u.host, port=u.port, scheme=u.scheme, pool_kwargs=pool_kwargs + ) + + def _merge_pool_kwargs( + self, override: dict[str, typing.Any] | None + ) -> dict[str, typing.Any]: + """ + Merge a dictionary of override values for self.connection_pool_kw. + + This does not modify self.connection_pool_kw and returns a new dict. + Any keys in the override dictionary with a value of ``None`` are + removed from the merged dictionary. + """ + base_pool_kwargs = self.connection_pool_kw.copy() + if override: + for key, value in override.items(): + if value is None: + try: + del base_pool_kwargs[key] + except KeyError: + pass + else: + base_pool_kwargs[key] = value + return base_pool_kwargs + + def _proxy_requires_url_absolute_form(self, parsed_url: Url) -> bool: + """ + Indicates if the proxy requires the complete destination URL in the + request. Normally this is only needed when not using an HTTP CONNECT + tunnel. + """ + if self.proxy is None: + return False + + return not connection_requires_http_tunnel( + self.proxy, self.proxy_config, parsed_url.scheme + ) + + def urlopen( # type: ignore[override] + self, method: str, url: str, redirect: bool = True, **kw: typing.Any + ) -> BaseHTTPResponse: + """ + Same as :meth:`urllib3.HTTPConnectionPool.urlopen` + with custom cross-host redirect logic and only sends the request-uri + portion of the ``url``. + + The given ``url`` parameter must be absolute, such that an appropriate + :class:`urllib3.connectionpool.ConnectionPool` can be chosen for it. + """ + u = parse_url(url) + + if u.scheme is None: + warnings.warn( + "URLs without a scheme (ie 'https://') are deprecated and will raise an error " + "in a future version of urllib3. To avoid this DeprecationWarning ensure all URLs " + "start with 'https://' or 'http://'. 
Read more in this issue: " + "https://github.com/urllib3/urllib3/issues/2920", + category=DeprecationWarning, + stacklevel=2, + ) + + conn = self.connection_from_host(u.host, port=u.port, scheme=u.scheme) + + kw["assert_same_host"] = False + kw["redirect"] = False + + if "headers" not in kw: + kw["headers"] = self.headers + + if self._proxy_requires_url_absolute_form(u): + response = conn.urlopen(method, url, **kw) + else: + response = conn.urlopen(method, u.request_uri, **kw) + + redirect_location = redirect and response.get_redirect_location() + if not redirect_location: + return response + + # Support relative URLs for redirecting. + redirect_location = urljoin(url, redirect_location) + + if response.status == 303: + # Change the method according to RFC 9110, Section 15.4.4. + method = "GET" + # And lose the body not to transfer anything sensitive. + kw["body"] = None + kw["headers"] = HTTPHeaderDict(kw["headers"])._prepare_for_method_change() + + retries = kw.get("retries") + if not isinstance(retries, Retry): + retries = Retry.from_int(retries, redirect=redirect) + + # Strip headers marked as unsafe to forward to the redirected location. + # Check remove_headers_on_redirect to avoid a potential network call within + # conn.is_same_host() which may use socket.gethostbyname() in the future. 
+ if retries.remove_headers_on_redirect and not conn.is_same_host( + redirect_location + ): + new_headers = kw["headers"].copy() + for header in kw["headers"]: + if header.lower() in retries.remove_headers_on_redirect: + new_headers.pop(header, None) + kw["headers"] = new_headers + + try: + retries = retries.increment(method, url, response=response, _pool=conn) + except MaxRetryError: + if retries.raise_on_redirect: + response.drain_conn() + raise + return response + + kw["retries"] = retries + kw["redirect"] = redirect + + log.info("Redirecting %s -> %s", url, redirect_location) + + response.drain_conn() + return self.urlopen(method, redirect_location, **kw) + + +class ProxyManager(PoolManager): + """ + Behaves just like :class:`PoolManager`, but sends all requests through + the defined proxy, using the CONNECT method for HTTPS URLs. + + :param proxy_url: + The URL of the proxy to be used. + + :param proxy_headers: + A dictionary containing headers that will be sent to the proxy. In case + of HTTP they are being sent with each request, while in the + HTTPS/CONNECT case they are sent only once. Could be used for proxy + authentication. + + :param proxy_ssl_context: + The proxy SSL context is used to establish the TLS connection to the + proxy when using HTTPS proxies. + + :param use_forwarding_for_https: + (Defaults to False) If set to True will forward requests to the HTTPS + proxy to be made on behalf of the client instead of creating a TLS + tunnel via the CONNECT method. **Enabling this flag means that request + and response headers and content will be visible from the HTTPS proxy** + whereas tunneling keeps request and response headers and content + private. IP address, target hostname, SNI, and port are always visible + to an HTTPS proxy even when this flag is disabled. + + :param proxy_assert_hostname: + The hostname of the certificate to verify against. + + :param proxy_assert_fingerprint: + The fingerprint of the certificate to verify against. 
def __init__(
    self,
    proxy_url: str,
    num_pools: int = 10,
    headers: typing.Mapping[str, str] | None = None,
    proxy_headers: typing.Mapping[str, str] | None = None,
    proxy_ssl_context: ssl.SSLContext | None = None,
    use_forwarding_for_https: bool = False,
    proxy_assert_hostname: None | str | Literal[False] = None,
    proxy_assert_fingerprint: str | None = None,
    **connection_pool_kw: typing.Any,
) -> None:
    """
    Parse the proxy URL, record the proxy settings and set up the manager.

    The parsed proxy, its headers and its :class:`ProxyConfig` are stored on
    the instance and also injected into ``connection_pool_kw`` (as
    ``_proxy``, ``_proxy_headers`` and ``_proxy_config``) before the parent
    initializer is called with ``num_pools``, ``headers`` and those keywords.

    :raises ProxySchemeUnknown:
        If the proxy URL scheme is neither ``http`` nor ``https``.
    """
    # A connection pool is accepted in place of a URL string; in that case
    # the URL is rebuilt from the pool's scheme, host and port.
    str_proxy_url = (
        f"{proxy_url.scheme}://{proxy_url.host}:{proxy_url.port}"
        if isinstance(proxy_url, HTTPConnectionPool)
        else proxy_url
    )
    parsed_proxy = parse_url(str_proxy_url)

    if parsed_proxy.scheme not in ("http", "https"):
        raise ProxySchemeUnknown(parsed_proxy.scheme)

    if not parsed_proxy.port:
        # No explicit port: use the scheme's default, falling back to 80.
        parsed_proxy = parsed_proxy._replace(
            port=port_by_scheme.get(parsed_proxy.scheme, 80)
        )

    self.proxy = parsed_proxy
    self.proxy_headers = proxy_headers or {}
    self.proxy_ssl_context = proxy_ssl_context
    self.proxy_config = ProxyConfig(
        proxy_ssl_context,
        use_forwarding_for_https,
        proxy_assert_hostname,
        proxy_assert_fingerprint,
    )

    connection_pool_kw.update(
        _proxy=self.proxy,
        _proxy_headers=self.proxy_headers,
        _proxy_config=self.proxy_config,
    )

    super().__init__(num_pools, headers, **connection_pool_kw)
def _set_proxy_headers(
    self, url: str, headers: typing.Mapping[str, str] | None = None
) -> typing.Mapping[str, str]:
    """
    Build the header mapping to send for a request to ``url`` via the proxy.

    The result starts with ``Accept: */*`` and, when ``url`` has a network
    location, a ``Host`` header set to it. Any ``headers`` supplied by the
    caller are applied last, so they win over these defaults for equal keys.

    :returns: A new dict; the ``headers`` argument is not modified.
    """
    merged = {"Accept": "*/*"}

    host = parse_url(url).netloc
    if host:
        merged["Host"] = host

    # User-provided headers take precedence over the defaults above.
    merged.update(headers or {})
    return merged
diff --git a/Modules/urllib3/response.py b/Modules/urllib3/response.py new file mode 100644 index 0000000..37936f9 --- /dev/null +++ b/Modules/urllib3/response.py @@ -0,0 +1,1130 @@ +from __future__ import annotations + +import collections +import io +import json as _json +import logging +import re +import sys +import typing +import warnings +import zlib +from contextlib import contextmanager +from http.client import HTTPMessage as _HttplibHTTPMessage +from http.client import HTTPResponse as _HttplibHTTPResponse +from socket import timeout as SocketTimeout + +try: + try: + import brotlicffi as brotli # type: ignore[import] + except ImportError: + import brotli # type: ignore[import] +except ImportError: + brotli = None + +try: + import zstandard as zstd # type: ignore[import] + + # The package 'zstandard' added the 'eof' property starting + # in v0.18.0 which we require to ensure a complete and + # valid zstd stream was fed into the ZstdDecoder. + # See: https://github.com/urllib3/urllib3/pull/2624 + _zstd_version = _zstd_version = tuple( + map(int, re.search(r"^([0-9]+)\.([0-9]+)", zstd.__version__).groups()) # type: ignore[union-attr] + ) + if _zstd_version < (0, 18): # Defensive: + zstd = None + +except (AttributeError, ImportError, ValueError): # Defensive: + zstd = None + +from . 
class DeflateDecoder(ContentDecoder):
    """
    Decoder for ``deflate`` content that copes with two stream variants.

    Input is first fed to a default :func:`zlib.decompressobj`. If that
    raises :class:`zlib.error` before any output has been produced, all
    input seen so far is replayed through a decompressor created with
    ``-zlib.MAX_WBITS`` instead.
    """

    def __init__(self) -> None:
        self._first_try = True
        # Input accumulated while still probing, so it can be replayed.
        self._data = b""
        self._obj = zlib.decompressobj()

    def decompress(self, data: bytes) -> bytes:
        """Decompress ``data``; empty input is returned unchanged."""
        if not data:
            return data

        # Format already settled: plain pass-through to the decompressor.
        if not self._first_try:
            return self._obj.decompress(data)

        self._data += data
        try:
            output = self._obj.decompress(data)
        except zlib.error:
            # Switch to the alternative decompressor and replay everything
            # received so far through it.
            self._first_try = False
            self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
            try:
                return self.decompress(self._data)
            finally:
                self._data = None  # type: ignore[assignment]
        else:
            if output:
                # First real output: the default decompressor works, so
                # the replay buffer is no longer needed.
                self._first_try = False
                self._data = None  # type: ignore[assignment]
            return output

    def flush(self) -> bytes:
        """Flush and return whatever the current decompressor still holds."""
        return self._obj.flush()
if zstd is not None:

    class ZstdDecoder(ContentDecoder):
        """Content decoder built on ``zstd.ZstdDecompressor().decompressobj()``."""

        def __init__(self) -> None:
            self._obj = zstd.ZstdDecompressor().decompressobj()

        def decompress(self, data: bytes) -> bytes:
            """
            Decompress ``data`` and return the decoded bytes.

            Whenever the current decompression object reports ``eof`` while
            still holding ``unused_data``, a new decompression object is
            created and fed that leftover input; this repeats until no such
            leftover remains.
            """
            if not data:
                return b""
            pieces = [self._obj.decompress(data)]
            while self._obj.eof and self._obj.unused_data:
                leftover = self._obj.unused_data
                self._obj = zstd.ZstdDecompressor().decompressobj()
                pieces.append(self._obj.decompress(leftover))
            return b"".join(pieces)

        def flush(self) -> bytes:
            """
            Return the decompressor's flush output.

            :raises DecodeError: If the decompressor has not reached ``eof``.
            """
            tail = self._obj.flush()  # note: this is a no-op
            if self._obj.eof:
                return tail  # type: ignore[no-any-return]
            raise DecodeError("Zstandard data is incomplete")
class BytesQueueBuffer:
    """FIFO byte buffer that hands out exactly-sized reads.

    Data is appended chunk by chunk with :meth:`put` and consumed from the
    front with :meth:`get`, which lets ``read()`` return decoded data while
    still honouring the requested byte count of the BufferedIOBase API.

    Memory use is bounded by the sum of:

     * ``self.buffer``, which holds all data not yet consumed
     * the bytes copied out by a single :meth:`get` call

    The worst case is a single stored chunk that is requested in full, in
    which case :meth:`get` makes a complete copy of it.
    """

    def __init__(self) -> None:
        self.buffer: typing.Deque[bytes] = collections.deque()
        # Total number of bytes currently stored across all chunks.
        self._size: int = 0

    def __len__(self) -> int:
        return self._size

    def put(self, data: bytes) -> None:
        """Append ``data`` as a new chunk at the end of the buffer."""
        self.buffer.append(data)
        self._size += len(data)

    def get(self, n: int) -> bytes:
        """
        Remove and return up to ``n`` bytes from the front of the buffer.

        Fewer than ``n`` bytes are returned only when the buffer runs out.

        :raises RuntimeError: If ``n`` is non-zero and the buffer is empty.
        :raises ValueError: If ``n`` is negative (and the buffer is not empty).
        """
        if n == 0:
            return b""
        if not self.buffer:
            raise RuntimeError("buffer is empty")
        if n < 0:
            raise ValueError("n should be > 0")

        wanted = n
        pieces = []
        while wanted > 0 and self.buffer:
            head = self.buffer.popleft()
            if wanted < len(head):
                # Only part of this chunk is needed: hand out the prefix
                # and push the remainder back to the front.
                pieces.append(head[:wanted])
                self.buffer.appendleft(head[wanted:])
                self._size -= wanted
                break
            pieces.append(head)
            self._size -= len(head)
            wanted -= len(head)

        return b"".join(pieces)
def json(self) -> typing.Any:
    """
    Parse the body of the HTTP response as JSON and return the result.

    The bytes from the ``data`` property are decoded as UTF-8 and passed to
    :func:`json.loads`. To use a custom JSON decoder, pass the value of the
    ``data`` property to that decoder yourself.

    This method can raise either `UnicodeDecodeError` (body is not valid
    UTF-8) or `json.JSONDecodeError` (decoded text is not valid JSON).
    """
    return _json.loads(self.data.decode("utf-8"))
+ """ + # Note: content-encoding value should be case-insensitive, per RFC 7230 + # Section 3.2 + content_encoding = self.headers.get("content-encoding", "").lower() + if self._decoder is None: + if content_encoding in self.CONTENT_DECODERS: + self._decoder = _get_decoder(content_encoding) + elif "," in content_encoding: + encodings = [ + e.strip() + for e in content_encoding.split(",") + if e.strip() in self.CONTENT_DECODERS + ] + if encodings: + self._decoder = _get_decoder(content_encoding) + + def _decode( + self, data: bytes, decode_content: bool | None, flush_decoder: bool + ) -> bytes: + """ + Decode the data passed in and potentially flush the decoder. + """ + if not decode_content: + if self._has_decoded_content: + raise RuntimeError( + "Calling read(decode_content=False) is not supported after " + "read(decode_content=True) was called." + ) + return data + + try: + if self._decoder: + data = self._decoder.decompress(data) + self._has_decoded_content = True + except self.DECODER_ERROR_CLASSES as e: + content_encoding = self.headers.get("content-encoding", "").lower() + raise DecodeError( + "Received response with content-encoding: %s, but " + "failed to decode it." % content_encoding, + e, + ) from e + if flush_decoder: + data += self._flush_decoder() + + return data + + def _flush_decoder(self) -> bytes: + """ + Flushes the decoder. Should only be called if the decoder is actually + being used. + """ + if self._decoder: + return self._decoder.decompress(b"") + self._decoder.flush() + return b"" + + # Compatibility methods for `io` module + def readinto(self, b: bytearray) -> int: + temp = self.read(len(b)) + if len(temp) == 0: + return 0 + else: + b[: len(temp)] = temp + return len(temp) + + # Compatibility methods for http.client.HTTPResponse + def getheaders(self) -> HTTPHeaderDict: + warnings.warn( + "HTTPResponse.getheaders() is deprecated and will be removed " + "in urllib3 v2.1.0. 
def getheader(self, name: str, default: str | None = None) -> str | None:
    """
    Deprecated ``http.client``-style accessor for a single header.

    Emits a :class:`DeprecationWarning` attributed to the caller and then
    returns ``self.headers.get(name, default)``.
    """
    deprecation_note = (
        "HTTPResponse.getheader() is deprecated and will be removed "
        "in urllib3 v2.1.0. Instead use HTTPResponse.headers.get(name, default)."
    )
    # stacklevel=2 attributes the warning to the code calling getheader().
    warnings.warn(deprecation_note, category=DeprecationWarning, stacklevel=2)
    return self.headers.get(name, default)
def release_conn(self) -> None:
    """
    Hand the held connection back to its pool, if there is one to return.

    Does nothing unless both a pool and a connection are set. After the
    connection has been passed to the pool's ``_put_conn`` the response
    drops its own reference to it.
    """
    if self._pool and self._connection:
        self._pool._put_conn(self._connection)
        self._connection = None
+ + Unread data in the HTTPResponse connection blocks the connection from being released back to the pool. + """ + try: + self.read() + except (HTTPError, OSError, BaseSSLError, HTTPException): + pass + + @property + def data(self) -> bytes: + # For backwards-compat with earlier urllib3 0.4 and earlier. + if self._body: + return self._body # type: ignore[return-value] + + if self._fp: + return self.read(cache_content=True) + + return None # type: ignore[return-value] + + @property + def connection(self) -> HTTPConnection | None: + return self._connection + + def isclosed(self) -> bool: + return is_fp_closed(self._fp) + + def tell(self) -> int: + """ + Obtain the number of bytes pulled over the wire so far. May differ from + the amount of content returned by :meth:``urllib3.response.HTTPResponse.read`` + if bytes are encoded on the wire (e.g, compressed). + """ + return self._fp_bytes_read + + def _init_length(self, request_method: str | None) -> int | None: + """ + Set initial length value for Response content if available. + """ + length: int | None + content_length: str | None = self.headers.get("content-length") + + if content_length is not None: + if self.chunked: + # This Response will fail with an IncompleteRead if it can't be + # received as chunked. This method falls back to attempt reading + # the response before raising an exception. + log.warning( + "Received response with both Content-Length and " + "Transfer-Encoding set. This is expressly forbidden " + "by RFC 7230 sec 3.3.2. Ignoring Content-Length and " + "attempting to process response as Transfer-Encoding: " + "chunked." + ) + return None + + try: + # RFC 7230 section 3.3.2 specifies multiple content lengths can + # be sent in a single Content-Length header + # (e.g. Content-Length: 42, 42). This line ensures the values + # are all valid ints and that as long as the `set` length is 1, + # all values are the same. Otherwise, the header is invalid. 
+ lengths = {int(val) for val in content_length.split(",")} + if len(lengths) > 1: + raise InvalidHeader( + "Content-Length contained multiple " + "unmatching values (%s)" % content_length + ) + length = lengths.pop() + except ValueError: + length = None + else: + if length < 0: + length = None + + else: # if content_length is None + length = None + + # Convert status to int for comparison + # In some cases, httplib returns a status of "_UNKNOWN" + try: + status = int(self.status) + except ValueError: + status = 0 + + # Check for responses that shouldn't include a body + if status in (204, 304) or 100 <= status < 200 or request_method == "HEAD": + length = 0 + + return length + + @contextmanager + def _error_catcher(self) -> typing.Generator[None, None, None]: + """ + Catch low-level python exceptions, instead re-raising urllib3 + variants, so that low-level exceptions are not leaked in the + high-level api. + + On exit, release the connection back to the pool. + """ + clean_exit = False + + try: + try: + yield + + except SocketTimeout as e: + # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but + # there is yet no clean way to get at it from this context. + raise ReadTimeoutError(self._pool, None, "Read timed out.") from e # type: ignore[arg-type] + + except BaseSSLError as e: + # FIXME: Is there a better way to differentiate between SSLErrors? + if "read operation timed out" not in str(e): + # SSL errors related to framing/MAC get wrapped and reraised here + raise SSLError(e) from e + + raise ReadTimeoutError(self._pool, None, "Read timed out.") from e # type: ignore[arg-type] + + except (HTTPException, OSError) as e: + # This includes IncompleteRead. + raise ProtocolError(f"Connection broken: {e!r}", e) from e + + # If no exception is thrown, we should avoid cleaning up + # unnecessarily. + clean_exit = True + finally: + # If we didn't terminate cleanly, we need to throw away our + # connection. 
+ if not clean_exit: + # The response may not be closed but we're not going to use it + # anymore so close it now to ensure that the connection is + # released back to the pool. + if self._original_response: + self._original_response.close() + + # Closing the response may not actually be sufficient to close + # everything, so if we have a hold of the connection close that + # too. + if self._connection: + self._connection.close() + + # If we hold the original response but it's closed now, we should + # return the connection back to the pool. + if self._original_response and self._original_response.isclosed(): + self.release_conn() + + def _fp_read(self, amt: int | None = None) -> bytes: + """ + Read a response with the thought that reading the number of bytes + larger than can fit in a 32-bit int at a time via SSL in some + known cases leads to an overflow error that has to be prevented + if `amt` or `self.length_remaining` indicate that a problem may + happen. + + The known cases: + * 3.8 <= CPython < 3.9.7 because of a bug + https://github.com/urllib3/urllib3/issues/2513#issuecomment-1152559900. + * urllib3 injected with pyOpenSSL-backed SSL-support. + * CPython < 3.10 only when `amt` does not fit 32-bit int. + """ + assert self._fp + c_int_max = 2**31 - 1 + if ( + (amt and amt > c_int_max) + or (self.length_remaining and self.length_remaining > c_int_max) + ) and (util.IS_PYOPENSSL or sys.version_info < (3, 10)): + buffer = io.BytesIO() + # Besides `max_chunk_amt` being a maximum chunk size, it + # affects memory overhead of reading a response by this + # method in CPython. + # `c_int_max` equal to 2 GiB - 1 byte is the actual maximum + # chunk size that does not lead to an overflow error, but + # 256 MiB is a compromise. 
+ max_chunk_amt = 2**28 + while amt is None or amt != 0: + if amt is not None: + chunk_amt = min(amt, max_chunk_amt) + amt -= chunk_amt + else: + chunk_amt = max_chunk_amt + data = self._fp.read(chunk_amt) + if not data: + break + buffer.write(data) + del data # to reduce peak memory usage by `max_chunk_amt`. + return buffer.getvalue() + else: + # StringIO doesn't like amt=None + return self._fp.read(amt) if amt is not None else self._fp.read() + + def _raw_read( + self, + amt: int | None = None, + ) -> bytes: + """ + Reads `amt` of bytes from the socket. + """ + if self._fp is None: + return None # type: ignore[return-value] + + fp_closed = getattr(self._fp, "closed", False) + + with self._error_catcher(): + data = self._fp_read(amt) if not fp_closed else b"" + if amt is not None and amt != 0 and not data: + # Platform-specific: Buggy versions of Python. + # Close the connection when no data is returned + # + # This is redundant to what httplib/http.client _should_ + # already do. However, versions of python released before + # December 15, 2012 (http://bugs.python.org/issue16298) do + # not properly close the connection in all cases. There is + # no harm in redundantly calling close. + self._fp.close() + if ( + self.enforce_content_length + and self.length_remaining is not None + and self.length_remaining != 0 + ): + # This is an edge case that httplib failed to cover due + # to concerns of backward compatibility. We're + # addressing it here to make sure IncompleteRead is + # raised during streaming, so all calls with incorrect + # Content-Length are caught. 
def read(
    self,
    amt: int | None = None,
    decode_content: bool | None = None,
    cache_content: bool = False,
) -> bytes:
    """
    Similar to :meth:`http.client.HTTPResponse.read`, but with two additional
    parameters: ``decode_content`` and ``cache_content``.

    :param amt:
        How much of the content to read. If specified, caching is skipped
        because it doesn't make sense to cache partial content as the full
        response.

    :param decode_content:
        If True, will attempt to decode the body based on the
        'content-encoding' header. If ``None``, ``self.decode_content``
        is used instead.

    :param cache_content:
        If True, will save the returned data such that the same result is
        returned despite of the state of the underlying file object. This
        is useful if you want the ``.data`` property to continue working
        after having ``.read()`` the file object. (Overridden if ``amt`` is
        set.)

    :returns:
        The bytes read. With ``amt`` set and decoding enabled, the result
        is taken from the internal decoded buffer and holds up to ``amt``
        bytes.

    :raises RuntimeError:
        If ``amt`` is set and decoding is declined after content has
        already been decoded.
    """
    self._init_decoder()
    if decode_content is None:
        decode_content = self.decode_content

    if amt is not None:
        cache_content = False

        # Enough decoded data is already buffered: serve it without
        # touching the underlying file object.
        if len(self._decoded_buffer) >= amt:
            return self._decoded_buffer.get(amt)

    data = self._raw_read(amt)

    # Flush when reading everything, or when a sized read came back empty.
    # NOTE(review): this flag is computed once here and reused unchanged by
    # every iteration of the refill loop below — confirm that is intended.
    flush_decoder = amt is None or (amt != 0 and not data)

    # Nothing new was read and nothing is buffered: hand back the raw result.
    if not data and len(self._decoded_buffer) == 0:
        return data

    if amt is None:
        data = self._decode(data, decode_content, flush_decoder)
        if cache_content:
            self._body = data
    else:
        # do not waste memory on buffer when not decoding
        if not decode_content:
            if self._has_decoded_content:
                raise RuntimeError(
                    "Calling read(decode_content=False) is not supported after "
                    "read(decode_content=True) was called."
                )
            return data

        decoded_data = self._decode(data, decode_content, flush_decoder)
        self._decoded_buffer.put(decoded_data)

        # Keep reading and decoding until the buffer can satisfy the
        # request or a raw read returns no more data.
        while len(self._decoded_buffer) < amt and data:
            # TODO make sure to initially read enough data to get past the headers
            # For example, the GZ file header takes 10 bytes, we don't want to read
            # it one byte at a time
            data = self._raw_read(amt)
            decoded_data = self._decode(data, decode_content, flush_decoder)
            self._decoded_buffer.put(decoded_data)
        data = self._decoded_buffer.get(amt)

    return data
+ """ + if self.chunked and self.supports_chunked_reads(): + yield from self.read_chunked(amt, decode_content=decode_content) + else: + while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0: + data = self.read(amt=amt, decode_content=decode_content) + + if data: + yield data + + # Overrides from io.IOBase + def readable(self) -> bool: + return True + + def close(self) -> None: + if not self.closed and self._fp: + self._fp.close() + + if self._connection: + self._connection.close() + + if not self.auto_close: + io.IOBase.close(self) + + @property + def closed(self) -> bool: + if not self.auto_close: + return io.IOBase.closed.__get__(self) # type: ignore[no-any-return] + elif self._fp is None: + return True + elif hasattr(self._fp, "isclosed"): + return self._fp.isclosed() + elif hasattr(self._fp, "closed"): + return self._fp.closed + else: + return True + + def fileno(self) -> int: + if self._fp is None: + raise OSError("HTTPResponse has no file to get a fileno from") + elif hasattr(self._fp, "fileno"): + return self._fp.fileno() + else: + raise OSError( + "The file-like object this HTTPResponse is wrapped " + "around has no file descriptor" + ) + + def flush(self) -> None: + if ( + self._fp is not None + and hasattr(self._fp, "flush") + and not getattr(self._fp, "closed", False) + ): + return self._fp.flush() + + def supports_chunked_reads(self) -> bool: + """ + Checks if the underlying file-like object looks like a + :class:`http.client.HTTPResponse` object. We do this by testing for + the fp attribute. If it is present we assume it returns raw chunks as + processed by read_chunked(). + """ + return hasattr(self._fp, "fp") + + def _update_chunk_length(self) -> None: + # First, we'll figure out length of a chunk and then + # we'll try to read it from socket. 
+ if self.chunk_left is not None: + return None + line = self._fp.fp.readline() # type: ignore[union-attr] + line = line.split(b";", 1)[0] + try: + self.chunk_left = int(line, 16) + except ValueError: + # Invalid chunked protocol response, abort. + self.close() + raise InvalidChunkLength(self, line) from None + + def _handle_chunk(self, amt: int | None) -> bytes: + returned_chunk = None + if amt is None: + chunk = self._fp._safe_read(self.chunk_left) # type: ignore[union-attr] + returned_chunk = chunk + self._fp._safe_read(2) # type: ignore[union-attr] # Toss the CRLF at the end of the chunk. + self.chunk_left = None + elif self.chunk_left is not None and amt < self.chunk_left: + value = self._fp._safe_read(amt) # type: ignore[union-attr] + self.chunk_left = self.chunk_left - amt + returned_chunk = value + elif amt == self.chunk_left: + value = self._fp._safe_read(amt) # type: ignore[union-attr] + self._fp._safe_read(2) # type: ignore[union-attr] # Toss the CRLF at the end of the chunk. + self.chunk_left = None + returned_chunk = value + else: # amt > self.chunk_left + returned_chunk = self._fp._safe_read(self.chunk_left) # type: ignore[union-attr] + self._fp._safe_read(2) # type: ignore[union-attr] # Toss the CRLF at the end of the chunk. + self.chunk_left = None + return returned_chunk # type: ignore[no-any-return] + + def read_chunked( + self, amt: int | None = None, decode_content: bool | None = None + ) -> typing.Generator[bytes, None, None]: + """ + Similar to :meth:`HTTPResponse.read`, but with an additional + parameter: ``decode_content``. + + :param amt: + How much of the content to read. If specified, caching is skipped + because it doesn't make sense to cache partial content as the full + response. + + :param decode_content: + If True, will attempt to decode the body based on the + 'content-encoding' header. + """ + self._init_decoder() + # FIXME: Rewrite this method and make it a class with a better structured logic. 
+ if not self.chunked: + raise ResponseNotChunked( + "Response is not chunked. " + "Header 'transfer-encoding: chunked' is missing." + ) + if not self.supports_chunked_reads(): + raise BodyNotHttplibCompatible( + "Body should be http.client.HTTPResponse like. " + "It should have have an fp attribute which returns raw chunks." + ) + + with self._error_catcher(): + # Don't bother reading the body of a HEAD request. + if self._original_response and is_response_to_head(self._original_response): + self._original_response.close() + return None + + # If a response is already read and closed + # then return immediately. + if self._fp.fp is None: # type: ignore[union-attr] + return None + + while True: + self._update_chunk_length() + if self.chunk_left == 0: + break + chunk = self._handle_chunk(amt) + decoded = self._decode( + chunk, decode_content=decode_content, flush_decoder=False + ) + if decoded: + yield decoded + + if decode_content: + # On CPython and PyPy, we should never need to flush the + # decoder. However, on Jython we *might* need to, so + # lets defensively do it anyway. + decoded = self._flush_decoder() + if decoded: # Platform-specific: Jython. + yield decoded + + # Chunk content ends with \r\n: discard it. + while self._fp is not None: + line = self._fp.fp.readline() + if not line: + # Some sites may not end with '\r\n'. + break + if line == b"\r\n": + break + + # We read everything; close the "file". + if self._original_response: + self._original_response.close() + + @property + def url(self) -> str | None: + """ + Returns the URL that was the source of this response. + If the request that generated this response redirected, this method + will return the final redirect location. 
+ """ + return self._request_url + + @url.setter + def url(self, url: str) -> None: + self._request_url = url + + def __iter__(self) -> typing.Iterator[bytes]: + buffer: list[bytes] = [] + for chunk in self.stream(decode_content=True): + if b"\n" in chunk: + chunks = chunk.split(b"\n") + yield b"".join(buffer) + chunks[0] + b"\n" + for x in chunks[1:-1]: + yield x + b"\n" + if chunks[-1]: + buffer = [chunks[-1]] + else: + buffer = [] + else: + buffer.append(chunk) + if buffer: + yield b"".join(buffer)