1401 lines
53 KiB
Python
1401 lines
53 KiB
Python
|
"""Utilities for input validation"""
|
||
|
|
||
|
# Authors: Olivier Grisel
|
||
|
# Gael Varoquaux
|
||
|
# Andreas Mueller
|
||
|
# Lars Buitinck
|
||
|
# Alexandre Gramfort
|
||
|
# Nicolas Tresegnie
|
||
|
# Sylvain Marie
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from functools import wraps
|
||
|
import warnings
|
||
|
import numbers
|
||
|
|
||
|
import numpy as np
|
||
|
import scipy.sparse as sp
|
||
|
from inspect import signature, isclass, Parameter
|
||
|
|
||
|
# mypy error: Module 'numpy.core.numeric' has no attribute 'ComplexWarning'
|
||
|
from numpy.core.numeric import ComplexWarning # type: ignore
|
||
|
import joblib
|
||
|
|
||
|
from contextlib import suppress
|
||
|
|
||
|
from .fixes import _object_dtype_isnan, parse_version
|
||
|
from .. import get_config as _get_config
|
||
|
from ..exceptions import PositiveSpectrumWarning
|
||
|
from ..exceptions import NotFittedError
|
||
|
from ..exceptions import DataConversionWarning
|
||
|
|
||
|
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
|
||
|
|
||
|
|
||
|
def _deprecate_positional_args(func=None, *, version="1.0 (renaming of 0.25)"):
    """Decorator that warns when keyword-only arguments are passed by position.

    Parameters declared after ``*`` (PEP 3102 keyword-only syntax) are still
    accepted positionally by the wrapped callable, but a ``FutureWarning`` is
    emitted naming the offending arguments.

    Parameters
    ----------
    func : callable, default=None
        Function to check arguments on. When None, a decorator is returned
        so that the ``@_deprecate_positional_args(version=...)`` form works.
    version : str, default="1.0 (renaming of 0.25)"
        The version when positional arguments will result in error; it is
        interpolated into the warning message.

    Returns
    -------
    callable
        The wrapped function when ``func`` is given, otherwise the decorator.
    """
    def _inner_deprecate_positional_args(f):
        sig = signature(f)
        params = sig.parameters
        # Names that may legitimately be passed by position.
        positional_names = [
            name for name, param in params.items()
            if param.kind == Parameter.POSITIONAL_OR_KEYWORD
        ]
        # Names that should only ever be passed by keyword.
        keyword_only_names = [
            name for name, param in params.items()
            if param.kind == Parameter.KEYWORD_ONLY
        ]
        n_positional = len(positional_names)

        @wraps(f)
        def inner_f(*args, **kwargs):
            n_extra = len(args) - n_positional
            if n_extra > 0:
                # The trailing positional values spill into keyword-only
                # parameters: report them, then forward everything by name.
                spilled = zip(keyword_only_names[:n_extra], args[-n_extra:])
                args_msg = ", ".join(
                    '{}={}'.format(name, value) for name, value in spilled
                )
                warnings.warn(f"Pass {args_msg} as keyword args. From version "
                              f"{version} passing these as positional arguments "
                              "will result in an error", FutureWarning)
                for name, value in zip(params, args):
                    kwargs[name] = value
                return f(**kwargs)
            return f(*args, **kwargs)
        return inner_f

    if func is None:
        return _inner_deprecate_positional_args
    return _inner_deprecate_positional_args(func)
|
||
|
|
||
|
|
||
|
def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Raise ValueError if array ``X`` holds forbidden non-finite values.

    Array-only counterpart of ``assert_all_finite``. The check is skipped
    entirely when the global configuration sets ``assume_finite``.

    Parameters
    ----------
    X : array-like
        Data to inspect; converted with ``np.asanyarray``.
    allow_nan : bool, default=False
        If True, NaN is tolerated in float data and only infinity is rejected.
    msg_dtype : dtype, default=None
        Dtype to mention in the error message instead of ``X.dtype``.

    Raises
    ------
    ValueError
        If float/complex data contains infinity (or NaN when ``allow_nan`` is
        False), or if object data contains NaN while ``allow_nan`` is False.
    """
    # Imported locally because extmath itself imports this module.
    from .extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    X = np.asanyarray(X)

    if X.dtype.kind in 'fc':
        # Cheap path first: a finite sum means every element is finite, in
        # O(n) time and O(1) extra space. The sum uses a safe accumulator to
        # limit dtype-induced overflow.
        if np.isfinite(_safe_accumulator_op(np.sum, X)):
            return
        # The sum may be non-finite purely because of overflow, so confirm
        # element-wise (O(n) space) before raising.
        if allow_nan:
            has_invalid = np.isinf(X).any()
            type_err = 'infinity'
        else:
            has_invalid = not np.isfinite(X).all()
            type_err = 'NaN, infinity'
        if has_invalid:
            reported_dtype = X.dtype if msg_dtype is None else msg_dtype
            raise ValueError(
                "Input contains {} or a value too large for {!r}."
                .format(type_err, reported_dtype)
            )
    # for object dtype data, we only check for NaNs (GH-13254)
    elif X.dtype == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def assert_all_finite(X, *, allow_nan=False):
    """Throw a ValueError if X contains NaN or infinity.

    For a sparse matrix only the stored values (``X.data``) are inspected.

    Parameters
    ----------
    X : {ndarray, sparse matrix}
        Data to check.

    allow_nan : bool, default=False
        Forwarded to ``_assert_all_finite``; when True, NaN values are
        tolerated.
    """
    values = X.data if sp.issparse(X) else X
    _assert_all_finite(values, allow_nan)
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def as_float_array(X, *, copy=True, force_all_finite=True):
    """Convert an array-like to an array of floats.

    The resulting dtype is np.float32 or np.float64 depending on the dtype
    of the input. Depending on ``copy`` the input may be returned as is or
    copied.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data to convert.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. It is
        forwarded to ``check_array``, which is only used for ``np.matrix``
        inputs and inputs that are neither ndarray nor sparse. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.
    """
    is_sparse = sp.issparse(X)
    needs_full_validation = isinstance(X, np.matrix) or not (
        isinstance(X, np.ndarray) or is_sparse)
    if needs_full_validation:
        return check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                           dtype=np.float64, copy=copy,
                           force_all_finite=force_all_finite, ensure_2d=False)

    if X.dtype in [np.float32, np.float64]:
        # Already a supported float dtype: only the copy policy matters.
        if not copy:
            return X
        if is_sparse:
            return X.copy()
        # Keep Fortran ordering for Fortran-contiguous dense arrays.
        return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C')

    # Bool/int data of at most 4 bytes per item is cast to float32; anything
    # else is cast to float64.
    if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4:
        return X.astype(np.float32)
    return X.astype(np.float64)
|
||
|
|
||
|
|
||
|
def _is_arraylike(x):
    """Return True if ``x`` has a ``__len__``, ``shape`` or ``__array__`` attribute."""
    return any(hasattr(x, attr) for attr in ('__len__', 'shape', '__array__'))
|
||
|
|
||
|
|
||
|
def _num_samples(x):
    """Return the number of samples in array-like ``x``.

    The first entry of ``x.shape`` is used when it is an integer; otherwise
    ``len(x)`` is returned.

    Raises
    ------
    TypeError
        If ``x`` has a callable ``fit`` attribute, is not sized / shaped /
        convertible through ``__array__``, is a 0-dimensional array, or does
        not support ``len``.
    """
    message = 'Expected sequence or array-like, got %s' % type(x)

    # Don't get num_samples from an ensembles length!
    looks_like_estimator = hasattr(x, 'fit') and callable(x.fit)
    if looks_like_estimator:
        raise TypeError(message)

    if not (hasattr(x, '__len__') or hasattr(x, 'shape')):
        if not hasattr(x, '__array__'):
            raise TypeError(message)
        x = np.asarray(x)

    shape = getattr(x, 'shape', None)
    if shape is not None:
        if len(shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        # Dask dataframes may not return a numeric shape[0] value; in that
        # case fall through to len().
        first_dim = shape[0]
        if isinstance(first_dim, numbers.Integral):
            return first_dim

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error
|
||
|
|
||
|
|
||
|
def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    An object is joblib.Memory-like when it can be turned into a
    joblib.Memory instance (None, or a str used as the cache location) or
    when it exposes the same interface (it has a ``cache`` attribute).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, str):
        # joblib versions before 0.12 name the location argument ``cachedir``.
        legacy_api = parse_version(joblib.__version__) < parse_version('0.12')
        location_kwarg = 'cachedir' if legacy_api else 'location'
        return joblib.Memory(**{location_kwarg: memory}, verbose=0)

    if not hasattr(memory, 'cache'):
        raise ValueError("'memory' should be None, a string or have the same"
                         " interface as joblib.Memory."
                         " Got memory='{}' instead.".format(memory))
    return memory
|
||
|
|
||
|
|
||
|
def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Every non-None argument must report the same number of samples.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length. ``None`` entries
        are ignored.

    Raises
    ------
    ValueError
        If the inputs do not all have the same number of samples.
    """
    sample_counts = []
    for candidate in arrays:
        if candidate is not None:
            sample_counts.append(_num_samples(candidate))

    if len(np.unique(sample_counts)) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(count) for count in sample_counts])
|
||
|
|
||
|
|
||
|
def _make_indexable(iterable):
    """Return ``iterable`` in a form that supports indexing.

    Sparse matrices are converted to CSR. ``None`` and objects that already
    expose ``__getitem__`` or ``iloc`` (e.g. pandas dataframes) are returned
    unchanged. Anything else is wrapped with ``np.array``.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()

    already_usable = (
        iterable is None
        or hasattr(iterable, "__getitem__")
        or hasattr(iterable, "iloc")
    )
    if already_usable:
        return iterable
    return np.array(iterable)
|
||
|
|
||
|
|
||
|
def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Each input is passed through ``_make_indexable`` (sparse matrices become
    CSR, None passes through, non-indexable objects become arrays) and the
    results are then checked for consistent length.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list
        The converted inputs, in the order they were given.
    """
    result = list(map(_make_indexable, iterables))
    check_consistent_length(*result)
    return result
|
||
|
|
||
|
|
||
|
def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
                          force_all_finite, accept_large_sparse):
    """Convert a sparse matrix to a given format.

    Validates the format of ``spmatrix`` against ``accept_sparse`` and
    converts format and/or dtype when required.

    Parameters
    ----------
    spmatrix : sparse matrix
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    accept_large_sparse : bool
        Forwarded to ``_check_large_sparse`` for the index dtype validation.

    Returns
    -------
    spmatrix_converted : sparse matrix.
        Matrix that is ensured to have an allowed type.

    Raises
    ------
    TypeError
        If ``accept_sparse`` is False.
    ValueError
        If ``accept_sparse`` is an empty list/tuple or of an unsupported type.
    """
    target_dtype = spmatrix.dtype if dtype is None else dtype

    if isinstance(accept_sparse, str):
        allowed_formats = [accept_sparse]
    else:
        allowed_formats = accept_sparse

    # Indices dtype validation
    _check_large_sparse(spmatrix, accept_large_sparse)

    if allowed_formats is False:
        raise TypeError('A sparse matrix was passed, but dense '
                        'data is required. Use X.toarray() to '
                        'convert to a dense numpy array.')

    format_converted = False
    if isinstance(allowed_formats, (list, tuple)):
        if len(allowed_formats) == 0:
            raise ValueError("When providing 'accept_sparse' "
                             "as a tuple or list, it must contain at "
                             "least one string value.")
        if spmatrix.format not in allowed_formats:
            # Convert to the first accepted format.
            spmatrix = spmatrix.asformat(allowed_formats[0])
            format_converted = True
    elif allowed_formats is not True:
        # Neither a string, a boolean nor a list/tuple.
        raise ValueError("Parameter 'accept_sparse' should be a string, "
                         "boolean or list of strings. You provided "
                         "'accept_sparse={}'.".format(allowed_formats))

    if target_dtype != spmatrix.dtype:
        spmatrix = spmatrix.astype(target_dtype)
    elif copy and not format_converted:
        # No conversion produced a new object yet, so copy explicitly.
        spmatrix = spmatrix.copy()

    if force_all_finite:
        if hasattr(spmatrix, "data"):
            _assert_all_finite(spmatrix.data,
                               allow_nan=force_all_finite == 'allow-nan')
        else:
            warnings.warn("Can't check %s sparse matrix for nan or inf."
                          % spmatrix.format, stacklevel=2)

    return spmatrix
|
||
|
|
||
|
|
||
|
def _ensure_no_complex_data(array):
    """Raise ValueError if ``array`` has a dtype whose ``kind`` is ``"c"``.

    Objects without a ``dtype`` attribute, or whose dtype is None or lacks a
    ``kind`` attribute, are accepted silently.
    """
    dtype = getattr(array, 'dtype', None)
    if getattr(dtype, 'kind', None) == "c":
        raise ValueError("Complex data not supported\n"
                         "{}\n".format(array))
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                dtype="numeric", order=None, copy=False, force_all_finite=True,
                ensure_2d=True, allow_nd=False, ensure_min_samples=1,
                ensure_min_features=1, estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : str, bool or list/tuple of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'} or None, default=None
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : bool, default=True
        Whether to raise a value error if array is not 2D.

    allow_nd : bool, default=False
        Whether to allow array.ndim > 2.

    ensure_min_samples : int, default=1
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    array_converted : object
        The converted and validated array.

    Raises
    ------
    ValueError
        If ``force_all_finite`` has an unsupported value, if the data is
        complex, has the wrong number of dimensions, cannot be converted to
        numeric values, or has fewer samples/features than required.
    """
    # store reference to original array to check if copy is needed when
    # function returns
    array_orig = array

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    has_pd_integer_array = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas.api.types import is_sparse
            if (not hasattr(array, 'sparse') and
                    array.dtypes.apply(is_sparse).any()):
                warnings.warn(
                    "pandas.DataFrame with sparse columns found. "
                    "It will be converted to a dense numpy array."
                )

        dtypes_orig = list(array.dtypes)
        # pandas boolean dtype __array__ interface coerces bools to objects
        for i, dtype_iter in enumerate(dtypes_orig):
            if dtype_iter.kind == 'b':
                dtypes_orig[i] = np.dtype(object)
            elif dtype_iter.name.startswith(("Int", "UInt")):
                # name looks like an Integer Extension Array, now check for
                # the dtype
                with suppress(ImportError):
                    from pandas import (Int8Dtype, Int16Dtype,
                                        Int32Dtype, Int64Dtype,
                                        UInt8Dtype, UInt16Dtype,
                                        UInt32Dtype, UInt64Dtype)
                    if isinstance(dtype_iter, (Int8Dtype, Int16Dtype,
                                               Int32Dtype, Int64Dtype,
                                               UInt8Dtype, UInt16Dtype,
                                               UInt32Dtype, UInt64Dtype)):
                        has_pd_integer_array = True

        # Only derive a common dtype when every column has a plain numpy
        # dtype (the loop variable is deliberately not named ``dtype``).
        if all(isinstance(column_dtype, np.dtype)
               for column_dtype in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if has_pd_integer_array:
        # If there are any pandas integer extension arrays, convert them
        # up front with the resolved dtype.
        array = array.astype(dtype)

    if force_all_finite not in (True, False, 'allow-nan'):
        raise ValueError('force_all_finite should be a bool or "allow-nan"'
                         '. Got {!r} instead'.format(force_all_finite))

    if estimator is not None:
        if isinstance(estimator, str):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = " by %s" % estimator_name if estimator is not None else ""

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, 'sparse') and array.ndim > 1:
        # DataFrame.sparse only supports `to_coo`
        array = array.sparse.to_coo()
        if array.dtype == np.dtype('object'):
            unique_dtypes = {dt.subtype.name for dt in array_orig.dtypes}
            if len(unique_dtypes) > 1:
                raise ValueError(
                    "Pandas DataFrame with mixed sparse extension arrays "
                    "generated a sparse matrix with object dtype which "
                    "can not be converted to a scipy sparse matrix. "
                    "Sparse extension arrays should all have the same "
                    "numeric type.")

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
                                      dtype=dtype, copy=copy,
                                      force_all_finite=force_all_finite,
                                      accept_large_sparse=accept_large_sparse)
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter('error', ComplexWarning)
                if dtype is not None and np.dtype(dtype).kind in 'iu':
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = np.asarray(array, order=order)
                    if array.dtype.kind == 'f':
                        _assert_all_finite(array, allow_nan=False,
                                           msg_dtype=dtype)
                    array = array.astype(dtype, casting="unsafe", copy=False)
                else:
                    array = np.asarray(array, order=order, dtype=dtype)
            except ComplexWarning as complex_warning:
                raise ValueError("Complex data not supported\n"
                                 "{}\n".format(array)) from complex_warning

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind in "OUSV":
            warnings.warn(
                "Arrays of bytes/strings is being converted to decimal "
                "numbers if dtype='numeric'. This behavior is deprecated in "
                "0.24 and will be removed in 1.1 (renaming of 0.26). Please "
                "convert your data to numeric values explicitly instead.",
                FutureWarning, stacklevel=2
            )
            try:
                array = array.astype(np.float64)
            except ValueError as e:
                raise ValueError(
                    "Unable to convert array of bytes/strings "
                    "into decimal numbers with dtype='numeric'") from e
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2."
                             % (array.ndim, estimator_name))

        if force_all_finite:
            _assert_all_finite(array,
                               allow_nan=force_all_finite == 'allow-nan')

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required%s."
                             % (n_samples, array.shape, ensure_min_samples,
                                context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required%s."
                             % (n_features, array.shape, ensure_min_features,
                                context))

    # A forced copy is only still needed if the conversions above returned
    # something that may share memory with the caller's input.
    if copy and np.may_share_memory(array, array_orig):
        array = np.array(array, dtype=dtype, order=order)

    return array
|
||
|
|
||
|
|
||
|
def _check_large_sparse(X, accept_large_sparse=False):
    """Raise a ValueError if X has 64bit indices and accept_large_sparse=False

    Only the COO, CSR, CSC and BSR formats are inspected; any other format is
    accepted without checks.
    """
    if accept_large_sparse:
        return

    supported_indices = ["int32"]
    # Attributes holding the index arrays, per sparse format.
    index_attrs_by_format = {
        "coo": ['col', 'row'],
        "csr": ['indices', 'indptr'],
        "csc": ['indices', 'indptr'],
        "bsr": ['indices', 'indptr'],
    }
    index_keys = index_attrs_by_format.get(X.getformat())
    if index_keys is None:
        return

    for key in index_keys:
        indices_datatype = getattr(X, key).dtype
        if indices_datatype not in supported_indices:
            raise ValueError("Only sparse matrices with 32-bit integer"
                             " indices are accepted. Got %s indices."
                             % indices_datatype)
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True,
              dtype="numeric", order=None, copy=False, force_all_finite=True,
              ensure_2d=True, allow_nd=False, multi_output=False,
              ensure_min_samples=1, ensure_min_features=1, y_numeric=False,
              estimator=None):
    """Input validation for standard estimators.

    Validates X with ``check_array``, validates y (as a 1D vector, or with
    ``check_array`` when ``multi_output`` is True) and checks that both have
    a consistent length. By default X must be a non-empty 2D array of finite
    values. y is always checked for np.nan / np.inf. If the dtype of X is
    object, a conversion to float is attempted, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.

    Raises
    ------
    ValueError
        If ``y`` is None.
    """
    if y is None:
        raise ValueError("y cannot be None")

    x_validation_options = dict(
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    X = check_array(X, **x_validation_options)

    if not multi_output:
        # Single-output targets must be a vector of finite values.
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y)
    else:
        # y is always required to be finite, regardless of force_all_finite.
        y = check_array(y, accept_sparse='csr', force_all_finite=True,
                        ensure_2d=False, dtype=None)

    needs_float_cast = y_numeric and y.dtype.kind == 'O'
    if needs_float_cast:
        y = y.astype(np.float64)

    check_consistent_length(X, y)

    return X, y
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def column_or_1d(y, *, warn=False):
    """Flatten a 1d array or a single-column 2d array into a 1d array.

    Parameters
    ----------
    y : array-like
        Input to flatten. It is first converted with ``np.asarray``.

    warn : bool, default=False
        If True, emit a ``DataConversionWarning`` when a column vector of
        shape (n_samples, 1) is flattened.

    Returns
    -------
    y : ndarray
        The raveled, one-dimensional version of the input.

    Raises
    ------
    ValueError
        If ``y`` is neither 1d nor a 2d array with exactly one column.
    """
    y = np.asarray(y)
    dims = np.shape(y)
    is_column = len(dims) == 2 and dims[1] == 1

    # Anything that is not a flat vector or a single column is rejected.
    if len(dims) != 1 and not is_column:
        raise ValueError(
            "y should be a 1d array, "
            "got an array of shape {} instead.".format(dims))

    if is_column and warn:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples, ), for example using ravel().",
                      DataConversionWarning, stacklevel=2)
    return np.ravel(y)
|
||
|
|
||
|
|
||
|
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None (or the ``np.random`` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    random_state : numpy.random.RandomState
        The random state object derived from ``seed``.

    Raises
    ------
    ValueError
        If ``seed`` is none of the supported kinds of value.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap the value in a 1-tuple: a bare tuple on the right-hand side of
    # ``%`` would be unpacked as the format arguments, turning the intended
    # ValueError into a TypeError (or silently reporting the wrong value).
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))
|
||
|
|
||
|
|
||
|
def has_fit_parameter(estimator, parameter):
    """Tell whether ``estimator.fit`` accepts an argument named ``parameter``.

    Parameters
    ----------
    estimator : object
        An estimator to inspect. Its ``fit`` attribute is introspected.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter: bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> has_fit_parameter(SVC(), "sample_weight")
    True

    """
    fit_signature = signature(estimator.fit)
    return parameter in fit_signature.parameters
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def check_symmetric(array, *, tol=1E-10, raise_warning=True,
                    raise_exception=False):
    """Ensure that ``array`` is 2D, square and symmetric.

    A symmetric input is returned unchanged. A non-symmetric input is
    replaced by the average of itself and its transpose, optionally after
    emitting a warning, or rejected with an exception.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays. Default = 1E-10.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.
    """
    if array.ndim != 2 or array.shape[0] != array.shape[1]:
        raise ValueError("array must be 2-dimensional and square. "
                         "shape = {0}".format(array.shape))

    is_sparse = sp.issparse(array)
    if is_sparse:
        delta = array - array.T
        # only csr, csc, and coo have `data` attribute
        if delta.format not in ('csr', 'csc', 'coo'):
            delta = delta.tocsr()
        is_symmetric = np.all(abs(delta.data) < tol)
    else:
        is_symmetric = np.allclose(array, array.T, atol=tol)

    # Nothing to repair: hand back the input untouched.
    if is_symmetric:
        return array

    if raise_exception:
        raise ValueError("Array must be symmetric")
    if raise_warning:
        warnings.warn("Array is not symmetric, and will be converted "
                      "to symmetric by average with its transpose.",
                      stacklevel=2)

    averaged = 0.5 * (array + array.T)
    if is_sparse:
        # Convert back to the sparse format the caller passed in.
        return getattr(averaged, 'to' + array.format)()
    return averaged
|
||
|
|
||
|
|
||
|
@_deprecate_positional_args
def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    This utility is meant to be used internally by estimators themselves,
    typically in their own predict / transform methods.

    Parameters
    ----------
    estimator : estimator instance
        estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exist an
        attribute that ends with a underscore and does not start with double
        underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``estimator`` is a class or has no ``fit`` attribute.

    NotFittedError
        If the attributes are not found.
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this estimator.")

    if not hasattr(estimator, 'fit'):
        # Format through a 1-tuple: with a bare value, a tuple argument would
        # be unpacked as the format arguments and produce a formatting error
        # (or a misleading message) instead of the intended one.
        raise TypeError("%s is not an estimator instance." % (estimator,))

    if attributes is None:
        # No explicit names: any public-looking attribute with a trailing
        # underscore found on the instance counts as evidence of fitting.
        fitted = [name for name in vars(estimator)
                  if name.endswith("_") and not name.startswith("__")]
    else:
        if not isinstance(attributes, (list, tuple)):
            attributes = [attributes]
        fitted = all_or_any([hasattr(estimator, attr) for attr in attributes])

    if not fitted:
        raise NotFittedError(msg % {'name': type(estimator).__name__})
|
||
|
|
||
|
|
||
|
def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data. Dense inputs that are not already arrays (e.g. nested
        lists) are converted with ``np.asarray`` before being inspected.

    whom : str
        Who passed X to this function.

    Raises
    ------
    ValueError
        If at least one value of ``X`` is negative. Inputs without any
        stored value (empty dense arrays, sparse matrices with no stored
        entries) are considered non-negative.
    """
    if sp.issparse(X):
        # avoid X.min() on sparse matrix since it also sorts the indices
        if X.format in ['lil', 'dok']:
            X = X.tocsr()
        values = X.data
    else:
        # Accept any array-like, not only objects exposing a ``min`` method.
        values = np.asarray(X)

    # An empty collection of values cannot contain a negative one; checking
    # the size first also avoids numpy's error on zero-size reductions.
    if values.size and values.min() < 0:
        raise ValueError("Negative values in data passed to %s" % whom)
|
||
|
|
||
|
|
||
|
def check_scalar(x, name, target_type, *, min_val=None, max_val=None):
    """Validate the type and the value range of a scalar parameter.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.

    name : str
        The name of the parameter to be printed in error messages.

    target_type : type or tuple
        Acceptable data types for the parameter.

    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.

    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.

    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.

    ValueError
        If the parameter's value violates the given bounds.
    """
    type_ok = isinstance(x, target_type)
    if not type_ok:
        raise TypeError(
            '`{}` must be an instance of {}, not {}.'.format(
                name, target_type, type(x)))

    # The lower bound is checked before the upper bound.
    has_lower_bound = min_val is not None
    if has_lower_bound and x < min_val:
        raise ValueError('`{}`= {}, must be >= {}.'.format(name, x, min_val))

    has_upper_bound = max_val is not None
    if has_upper_bound and x > max_val:
        raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val))
|
||
|
|
||
|
|
||
|
def _check_psd_eigenvalues(lambdas, enable_warnings=False):
|
||
|
"""Check the eigenvalues of a positive semidefinite (PSD) matrix.
|
||
|
|
||
|
Checks the provided array of PSD matrix eigenvalues for numerical or
|
||
|
conditioning issues and returns a fixed validated version. This method
|
||
|
should typically be used if the PSD matrix is user-provided (e.g. a
|
||
|
Gram matrix) or computed using a user-provided dissimilarity metric
|
||
|
(e.g. kernel function), or if the decomposition process uses approximation
|
||
|
methods (randomized SVD, etc.).
|
||
|
|
||
|
It checks for three things:
|
||
|
|
||
|
- that there are no significant imaginary parts in eigenvalues (more than
|
||
|
1e-5 times the maximum real part). If this check fails, it raises a
|
||
|
``ValueError``. Otherwise all non-significant imaginary parts that may
|
||
|
remain are set to zero. This operation is traced with a
|
||
|
``PositiveSpectrumWarning`` when ``enable_warnings=True``.
|
||
|
|
||
|
- that eigenvalues are not all negative. If this check fails, it raises a
|
||
|
``ValueError``
|
||
|
|
||
|
- that there are no significant negative eigenvalues with absolute value
|
||
|
more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest
|
||
|
positive eigenvalue in double (simple) precision. If this check fails,
|
||
|
it raises a ``ValueError``. Otherwise all negative eigenvalues that may
|
||
|
remain are set to zero. This operation is traced with a
|
||
|
``PositiveSpectrumWarning`` when ``enable_warnings=True``.
|
||
|
|
||
|
Finally, all the positive eigenvalues that are too small (with a value
|
||
|
smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to
|
||
|
zero. This operation is traced with a ``PositiveSpectrumWarning`` when
|
||
|
``enable_warnings=True``.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
lambdas : array-like of shape (n_eigenvalues,)
|
||
|
Array of eigenvalues to check / fix.
|
||
|
|
||
|
enable_warnings : bool, default=False
|
||
|
When this is set to ``True``, a ``PositiveSpectrumWarning`` will be
|
||
|
raised when there are imaginary parts, negative eigenvalues, or
|
||
|
extremely small non-zero eigenvalues. Otherwise no warning will be
|
||
|
raised. In both cases, imaginary parts, negative eigenvalues, and
|
||
|
extremely small non-zero eigenvalues will be set to zero.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
lambdas_fixed : ndarray of shape (n_eigenvalues,)
|
||
|
A fixed validated copy of the array of eigenvalues.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> _check_psd_eigenvalues([1, 2]) # nominal case
|
||
|
array([1, 2])
|
||
|
>>> _check_psd_eigenvalues([5, 5j]) # significant imag part
|
||
|
Traceback (most recent call last):
|
||
|
...
|
||
|
ValueError: There are significant imaginary parts in eigenvalues (1
|
||
|
of the maximum real part). Either the matrix is not PSD, or there was
|
||
|
an issue while computing the eigendecomposition of the matrix.
|
||
|
>>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part
|
||
|
array([5., 0.])
|
||
|
>>> _check_psd_eigenvalues([-5, -1]) # all negative
|
||
|
Traceback (most recent call last):
|
||
|
...
|
||
|
ValueError: All eigenvalues are negative (maximum is -1). Either the
|
||
|
matrix is not PSD, or there was an issue while computing the
|
||
|
eigendecomposition of the matrix.
|
||
|
>>> _check_psd_eigenvalues([5, -1]) # significant negative
|
||
|
Traceback (most recent call last):
|
||
|
...
|
||
|
ValueError: There are significant negative eigenvalues (0.2 of the
|
||
|
maximum positive). Either the matrix is not PSD, or there was an issue
|
||
|
while computing the eigendecomposition of the matrix.
|
||
|
>>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative
|
||
|
array([5., 0.])
|
||
|
>>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small)
|
||
|
array([5., 0.])
|
||
|
|
||
|
"""
|
||
|
|
||
|
lambdas = np.array(lambdas)
|
||
|
is_double_precision = lambdas.dtype == np.float64
|
||
|
|
||
|
# note: the minimum value available is
|
||
|
# - single-precision: np.finfo('float32').eps = 1.2e-07
|
||
|
# - double-precision: np.finfo('float64').eps = 2.2e-16
|
||
|
|
||
|
# the various thresholds used for validation
|
||
|
# we may wish to change the value according to precision.
|
||
|
significant_imag_ratio = 1e-5
|
||
|
significant_neg_ratio = 1e-5 if is_double_precision else 5e-3
|
||
|
significant_neg_value = 1e-10 if is_double_precision else 1e-6
|
||
|
small_pos_ratio = 1e-12 if is_double_precision else 2e-7
|
||
|
|
||
|
# Check that there are no significant imaginary parts
|
||
|
if not np.isreal(lambdas).all():
|
||
|
max_imag_abs = np.abs(np.imag(lambdas)).max()
|
||
|
max_real_abs = np.abs(np.real(lambdas)).max()
|
||
|
if max_imag_abs > significant_imag_ratio * max_real_abs:
|
||
|
raise ValueError(
|
||
|
"There are significant imaginary parts in eigenvalues (%g "
|
||
|
"of the maximum real part). Either the matrix is not PSD, or "
|
||
|
"there was an issue while computing the eigendecomposition "
|
||
|
"of the matrix."
|
||
|
% (max_imag_abs / max_real_abs))
|
||
|
|
||
|
# warn about imaginary parts being removed
|
||
|
if enable_warnings:
|
||
|
warnings.warn("There are imaginary parts in eigenvalues (%g "
|
||
|
"of the maximum real part). Either the matrix is not"
|
||
|
" PSD, or there was an issue while computing the "
|
||
|
"eigendecomposition of the matrix. Only the real "
|
||
|
"parts will be kept."
|
||
|
% (max_imag_abs / max_real_abs),
|
||
|
PositiveSpectrumWarning)
|
||
|
|
||
|
# Remove all imaginary parts (even if zero)
|
||
|
lambdas = np.real(lambdas)
|
||
|
|
||
|
# Check that there are no significant negative eigenvalues
|
||
|
max_eig = lambdas.max()
|
||
|
if max_eig < 0:
|
||
|
raise ValueError("All eigenvalues are negative (maximum is %g). "
|
||
|
"Either the matrix is not PSD, or there was an "
|
||
|
"issue while computing the eigendecomposition of "
|
||
|
"the matrix." % max_eig)
|
||
|
|
||
|
else:
|
||
|
min_eig = lambdas.min()
|
||
|
if (min_eig < -significant_neg_ratio * max_eig
|
||
|
and min_eig < -significant_neg_value):
|
||
|
raise ValueError("There are significant negative eigenvalues (%g"
|
||
|
" of the maximum positive). Either the matrix is "
|
||
|
"not PSD, or there was an issue while computing "
|
||
|
"the eigendecomposition of the matrix."
|
||
|
% (-min_eig / max_eig))
|
||
|
elif min_eig < 0:
|
||
|
# Remove all negative values and warn about it
|
||
|
if enable_warnings:
|
||
|
warnings.warn("There are negative eigenvalues (%g of the "
|
||
|
"maximum positive). Either the matrix is not "
|
||
|
"PSD, or there was an issue while computing the"
|
||
|
" eigendecomposition of the matrix. Negative "
|
||
|
"eigenvalues will be replaced with 0."
|
||
|
% (-min_eig / max_eig),
|
||
|
PositiveSpectrumWarning)
|
||
|
lambdas[lambdas < 0] = 0
|
||
|
|
||
|
# Check for conditioning (small positive non-zeros)
|
||
|
too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
|
||
|
if too_small_lambdas.any():
|
||
|
if enable_warnings:
|
||
|
warnings.warn("Badly conditioned PSD matrix spectrum: the largest "
|
||
|
"eigenvalue is more than %g times the smallest. "
|
||
|
"Small eigenvalues will be replaced with 0."
|
||
|
"" % (1 / small_pos_ratio),
|
||
|
PositiveSpectrumWarning)
|
||
|
lambdas[too_small_lambdas] = 0
|
||
|
|
||
|
return lambdas
|
||
|
|
||
|
|
||
|
def _check_sample_weight(sample_weight, X, dtype=None):
    """Validate sample weights and return them as a 1D array.

    Note that passing sample_weight=None will output an array of ones.
    Therefore, in some cases, you may want to protect the call with:
    if sample_weight is not None:
        sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
       Input sample weights.

    X : {ndarray, list, sparse matrix}
        Input data.

    dtype: dtype, default=None
       dtype of the validated `sample_weight`.
       If None, and the input `sample_weight` is an array, the dtype of the
       input is preserved; otherwise an array with the default numpy dtype
       is allocated. If `dtype` is not one of `float32`, `float64`,
       `None`, the output will be of dtype `float64`.

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
       Validated sample weight. It is guaranteed to be "C" contiguous.

    Raises
    ------
    ValueError
        If array-valued weights are not 1D or do not have one entry per
        sample of ``X``.
    """
    n_samples = _num_samples(X)

    # Any explicitly requested dtype other than float32/float64 falls back
    # to float64.
    if not (dtype is None or dtype in (np.float32, np.float64)):
        dtype = np.float64

    # Missing or scalar weights are broadcast to one weight per sample.
    if sample_weight is None:
        return np.ones(n_samples, dtype=dtype)
    if isinstance(sample_weight, numbers.Number):
        return np.full(n_samples, sample_weight, dtype=dtype)

    accepted_dtype = [np.float64, np.float32] if dtype is None else dtype
    weights = check_array(
        sample_weight, accept_sparse=False, ensure_2d=False,
        dtype=accepted_dtype, order="C"
    )
    if weights.ndim != 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    expected_shape = (n_samples,)
    if weights.shape != expected_shape:
        raise ValueError("sample_weight.shape == {}, expected {}!"
                         .format(weights.shape, expected_shape))
    return weights
|
||
|
|
||
|
|
||
|
def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
|
||
|
"""Check allclose for sparse and dense data.
|
||
|
|
||
|
Both x and y need to be either sparse or dense, they
|
||
|
can't be mixed.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
x : {array-like, sparse matrix}
|
||
|
First array to compare.
|
||
|
|
||
|
y : {array-like, sparse matrix}
|
||
|
Second array to compare.
|
||
|
|
||
|
rtol : float, default=1e-7
|
||
|
Relative tolerance; see numpy.allclose.
|
||
|
|
||
|
atol : float, default=1e-9
|
||
|
absolute tolerance; see numpy.allclose. Note that the default here is
|
||
|
more tolerant than the default for numpy.testing.assert_allclose, where
|
||
|
atol=0.
|
||
|
"""
|
||
|
if sp.issparse(x) and sp.issparse(y):
|
||
|
x = x.tocsr()
|
||
|
y = y.tocsr()
|
||
|
x.sum_duplicates()
|
||
|
y.sum_duplicates()
|
||
|
return (np.array_equal(x.indices, y.indices) and
|
||
|
np.array_equal(x.indptr, y.indptr) and
|
||
|
np.allclose(x.data, y.data, rtol=rtol, atol=atol))
|
||
|
elif not sp.issparse(x) and not sp.issparse(y):
|
||
|
return np.allclose(x, y, rtol=rtol, atol=atol)
|
||
|
raise ValueError("Can only compare two sparse matrices, not a sparse "
|
||
|
"matrix and an array")
|
||
|
|
||
|
|
||
|
def _check_fit_params(X, fit_params, indices=None):
    """Validate the parameters passed during `fit` and index them if needed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    fit_params : dict
        Dictionary containing the parameters passed at fit.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    fit_params_validated : dict
        Validated parameters. We ensure that the values support indexing.
    """
    from . import _safe_indexing

    validated = {}
    for key, value in fit_params.items():
        # Only array-likes with one entry per sample of X are indexed.
        per_sample = (_is_arraylike(value) and
                      _num_samples(value) == _num_samples(X))
        if per_sample:
            # Any other fit_params should support indexing
            # (e.g. for cross-validation).
            validated[key] = _safe_indexing(_make_indexable(value), indices)
        else:
            # Non-indexable pass-through (for now for backward-compatibility).
            # https://github.com/scikit-learn/scikit-learn/issues/15805
            validated[key] = value

    return validated
|