1110 lines
43 KiB
Python
1110 lines
43 KiB
Python
"""Hierarchical Agglomerative Clustering
|
|
|
|
These routines perform some hierarchical agglomerative clustering of some
|
|
input data.
|
|
|
|
Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort,
|
|
Gael Varoquaux
|
|
License: BSD 3 clause
|
|
"""
|
|
import warnings
|
|
from heapq import heapify, heappop, heappush, heappushpop
|
|
|
|
import numpy as np
|
|
from scipy import sparse
|
|
from scipy.sparse.csgraph import connected_components
|
|
|
|
from ..base import BaseEstimator, ClusterMixin
|
|
from ..metrics.pairwise import paired_distances, pairwise_distances
|
|
from ..neighbors import DistanceMetric
|
|
from ..neighbors._dist_metrics import METRIC_MAPPING
|
|
from ..utils import check_array
|
|
from ..utils._fast_dict import IntFloatDict
|
|
from ..utils.fixes import _astype_copy_false
|
|
from ..utils.validation import _deprecate_positional_args, check_memory
|
|
# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
|
|
from . import _hierarchical_fast as _hierarchical # type: ignore
|
|
from ._feature_agglomeration import AgglomerationTransform
|
|
|
|
###############################################################################
|
|
# For non fully-connected graphs
|
|
|
|
|
|
def _fix_connectivity(X, connectivity, affinity):
|
|
"""
|
|
Fixes the connectivity matrix
|
|
|
|
- copies it
|
|
- makes it symmetric
|
|
- converts it to LIL if necessary
|
|
- completes it if necessary
|
|
"""
|
|
n_samples = X.shape[0]
|
|
if (connectivity.shape[0] != n_samples or
|
|
connectivity.shape[1] != n_samples):
|
|
raise ValueError('Wrong shape for connectivity matrix: %s '
|
|
'when X is %s' % (connectivity.shape, X.shape))
|
|
|
|
# Make the connectivity matrix symmetric:
|
|
connectivity = connectivity + connectivity.T
|
|
|
|
# Convert connectivity matrix to LIL
|
|
if not sparse.isspmatrix_lil(connectivity):
|
|
if not sparse.isspmatrix(connectivity):
|
|
connectivity = sparse.lil_matrix(connectivity)
|
|
else:
|
|
connectivity = connectivity.tolil()
|
|
|
|
# Compute the number of nodes
|
|
n_connected_components, labels = connected_components(connectivity)
|
|
|
|
if n_connected_components > 1:
|
|
warnings.warn("the number of connected components of the "
|
|
"connectivity matrix is %d > 1. Completing it to avoid "
|
|
"stopping the tree early." % n_connected_components,
|
|
stacklevel=2)
|
|
# XXX: Can we do without completing the matrix?
|
|
for i in range(n_connected_components):
|
|
idx_i = np.where(labels == i)[0]
|
|
Xi = X[idx_i]
|
|
for j in range(i):
|
|
idx_j = np.where(labels == j)[0]
|
|
Xj = X[idx_j]
|
|
D = pairwise_distances(Xi, Xj, metric=affinity)
|
|
ii, jj = np.where(D == np.min(D))
|
|
ii = ii[0]
|
|
jj = jj[0]
|
|
connectivity[idx_i[ii], idx_j[jj]] = True
|
|
connectivity[idx_j[jj], idx_i[ii]] = True
|
|
|
|
return connectivity, n_connected_components
|
|
|
|
|
|
def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters,
|
|
n_connected_components, return_distance):
|
|
"""
|
|
Perform single linkage clustering on sparse data via the minimum
|
|
spanning tree from scipy.sparse.csgraph, then using union-find to label.
|
|
The parent array is then generated by walking through the tree.
|
|
"""
|
|
from scipy.sparse.csgraph import minimum_spanning_tree
|
|
|
|
# explicitly cast connectivity to ensure safety
|
|
connectivity = connectivity.astype('float64',
|
|
**_astype_copy_false(connectivity))
|
|
|
|
# Ensure zero distances aren't ignored by setting them to "epsilon"
|
|
epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps
|
|
connectivity.data[connectivity.data == 0] = epsilon_value
|
|
|
|
# Use scipy.sparse.csgraph to generate a minimum spanning tree
|
|
mst = minimum_spanning_tree(connectivity.tocsr())
|
|
|
|
# Convert the graph to scipy.cluster.hierarchy array format
|
|
mst = mst.tocoo()
|
|
|
|
# Undo the epsilon values
|
|
mst.data[mst.data == epsilon_value] = 0
|
|
|
|
mst_array = np.vstack([mst.row, mst.col, mst.data]).T
|
|
|
|
# Sort edges of the min_spanning_tree by weight
|
|
mst_array = mst_array[np.argsort(mst_array.T[2], kind='mergesort'), :]
|
|
|
|
# Convert edge list into standard hierarchical clustering format
|
|
single_linkage_tree = _hierarchical._single_linkage_label(mst_array)
|
|
children_ = single_linkage_tree[:, :2].astype(int)
|
|
|
|
# Compute parents
|
|
parent = np.arange(n_nodes, dtype=np.intp)
|
|
for i, (left, right) in enumerate(children_, n_samples):
|
|
if n_clusters is not None and i >= n_nodes:
|
|
break
|
|
if left < n_nodes:
|
|
parent[left] = i
|
|
if right < n_nodes:
|
|
parent[right] = i
|
|
|
|
if return_distance:
|
|
distances = single_linkage_tree[:, 2]
|
|
return children_, n_connected_components, n_samples, parent, distances
|
|
return children_, n_connected_components, n_samples, parent
|
|
|
|
|
|
###############################################################################
|
|
# Hierarchical tree building functions
|
|
|
|
@_deprecate_positional_args
|
|
def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
|
|
"""Ward clustering based on a Feature matrix.
|
|
|
|
Recursively merges the pair of clusters that minimally increases
|
|
within-cluster variance.
|
|
|
|
The inertia matrix uses a Heapq-based representation.
|
|
|
|
This is the structured version, that takes into account some topological
|
|
structure between samples.
|
|
|
|
Read more in the :ref:`User Guide <hierarchical_clustering>`.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
feature matrix representing n_samples samples to be clustered
|
|
|
|
connectivity : sparse matrix, default=None
|
|
connectivity matrix. Defines for each sample the neighboring samples
|
|
following a given structure of the data. The matrix is assumed to
|
|
be symmetric and only the upper triangular half is used.
|
|
Default is None, i.e, the Ward algorithm is unstructured.
|
|
|
|
n_clusters : int, default=None
|
|
Stop early the construction of the tree at n_clusters. This is
|
|
useful to decrease computation time if the number of clusters is
|
|
not small compared to the number of samples. In this case, the
|
|
complete tree is not computed, thus the 'children' output is of
|
|
limited use, and the 'parents' output should rather be used.
|
|
This option is valid only when specifying a connectivity matrix.
|
|
|
|
return_distance : bool, default=None
|
|
If True, return the distance between the clusters.
|
|
|
|
Returns
|
|
-------
|
|
children : ndarray of shape (n_nodes-1, 2)
|
|
The children of each non-leaf node. Values less than `n_samples`
|
|
correspond to leaves of the tree which are the original samples.
|
|
A node `i` greater than or equal to `n_samples` is a non-leaf
|
|
node and has children `children_[i - n_samples]`. Alternatively
|
|
at the i-th iteration, children[i][0] and children[i][1]
|
|
are merged to form node `n_samples + i`
|
|
|
|
n_connected_components : int
|
|
The number of connected components in the graph.
|
|
|
|
n_leaves : int
|
|
The number of leaves in the tree
|
|
|
|
parents : ndarray of shape (n_nodes,) or None
|
|
The parent of each node. Only returned when a connectivity matrix
|
|
is specified, elsewhere 'None' is returned.
|
|
|
|
distances : ndarray of shape (n_nodes-1,)
|
|
Only returned if return_distance is set to True (for compatibility).
|
|
The distances between the centers of the nodes. `distances[i]`
|
|
corresponds to a weighted euclidean distance between
|
|
the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to
|
|
leaves of the tree, then `distances[i]` is their unweighted euclidean
|
|
distance. Distances are updated in the following way
|
|
(from scipy.hierarchy.linkage):
|
|
|
|
The new entry :math:`d(u,v)` is computed as follows,
|
|
|
|
.. math::
|
|
|
|
d(u,v) = \\sqrt{\\frac{|v|+|s|}
|
|
{T}d(v,s)^2
|
|
+ \\frac{|v|+|t|}
|
|
{T}d(v,t)^2
|
|
- \\frac{|v|}
|
|
{T}d(s,t)^2}
|
|
|
|
where :math:`u` is the newly joined cluster consisting of
|
|
clusters :math:`s` and :math:`t`, :math:`v` is an unused
|
|
cluster in the forest, :math:`T=|v|+|s|+|t|`, and
|
|
:math:`|*|` is the cardinality of its argument. This is also
|
|
known as the incremental algorithm.
|
|
"""
|
|
X = np.asarray(X)
|
|
if X.ndim == 1:
|
|
X = np.reshape(X, (-1, 1))
|
|
n_samples, n_features = X.shape
|
|
|
|
if connectivity is None:
|
|
from scipy.cluster import hierarchy # imports PIL
|
|
|
|
if n_clusters is not None:
|
|
warnings.warn('Partial build of the tree is implemented '
|
|
'only for structured clustering (i.e. with '
|
|
'explicit connectivity). The algorithm '
|
|
'will build the full tree and only '
|
|
'retain the lower branches required '
|
|
'for the specified number of clusters',
|
|
stacklevel=2)
|
|
X = np.require(X, requirements="W")
|
|
out = hierarchy.ward(X)
|
|
children_ = out[:, :2].astype(np.intp)
|
|
|
|
if return_distance:
|
|
distances = out[:, 2]
|
|
return children_, 1, n_samples, None, distances
|
|
else:
|
|
return children_, 1, n_samples, None
|
|
|
|
connectivity, n_connected_components = _fix_connectivity(
|
|
X, connectivity,
|
|
affinity='euclidean')
|
|
if n_clusters is None:
|
|
n_nodes = 2 * n_samples - 1
|
|
else:
|
|
if n_clusters > n_samples:
|
|
raise ValueError('Cannot provide more clusters than samples. '
|
|
'%i n_clusters was asked, and there are %i '
|
|
'samples.' % (n_clusters, n_samples))
|
|
n_nodes = 2 * n_samples - n_clusters
|
|
|
|
# create inertia matrix
|
|
coord_row = []
|
|
coord_col = []
|
|
A = []
|
|
for ind, row in enumerate(connectivity.rows):
|
|
A.append(row)
|
|
# We keep only the upper triangular for the moments
|
|
# Generator expressions are faster than arrays on the following
|
|
row = [i for i in row if i < ind]
|
|
coord_row.extend(len(row) * [ind, ])
|
|
coord_col.extend(row)
|
|
|
|
coord_row = np.array(coord_row, dtype=np.intp, order='C')
|
|
coord_col = np.array(coord_col, dtype=np.intp, order='C')
|
|
|
|
# build moments as a list
|
|
moments_1 = np.zeros(n_nodes, order='C')
|
|
moments_1[:n_samples] = 1
|
|
moments_2 = np.zeros((n_nodes, n_features), order='C')
|
|
moments_2[:n_samples] = X
|
|
inertia = np.empty(len(coord_row), dtype=np.float64, order='C')
|
|
_hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
|
|
inertia)
|
|
inertia = list(zip(inertia, coord_row, coord_col))
|
|
heapify(inertia)
|
|
|
|
# prepare the main fields
|
|
parent = np.arange(n_nodes, dtype=np.intp)
|
|
used_node = np.ones(n_nodes, dtype=bool)
|
|
children = []
|
|
if return_distance:
|
|
distances = np.empty(n_nodes - n_samples)
|
|
|
|
not_visited = np.empty(n_nodes, dtype=np.int8, order='C')
|
|
|
|
# recursive merge loop
|
|
for k in range(n_samples, n_nodes):
|
|
# identify the merge
|
|
while True:
|
|
inert, i, j = heappop(inertia)
|
|
if used_node[i] and used_node[j]:
|
|
break
|
|
parent[i], parent[j] = k, k
|
|
children.append((i, j))
|
|
used_node[i] = used_node[j] = False
|
|
if return_distance: # store inertia value
|
|
distances[k - n_samples] = inert
|
|
|
|
# update the moments
|
|
moments_1[k] = moments_1[i] + moments_1[j]
|
|
moments_2[k] = moments_2[i] + moments_2[j]
|
|
|
|
# update the structure matrix A and the inertia matrix
|
|
coord_col = []
|
|
not_visited.fill(1)
|
|
not_visited[k] = 0
|
|
_hierarchical._get_parents(A[i], coord_col, parent, not_visited)
|
|
_hierarchical._get_parents(A[j], coord_col, parent, not_visited)
|
|
# List comprehension is faster than a for loop
|
|
[A[col].append(k) for col in coord_col]
|
|
A.append(coord_col)
|
|
coord_col = np.array(coord_col, dtype=np.intp, order='C')
|
|
coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')
|
|
coord_row.fill(k)
|
|
n_additions = len(coord_row)
|
|
ini = np.empty(n_additions, dtype=np.float64, order='C')
|
|
|
|
_hierarchical.compute_ward_dist(moments_1, moments_2,
|
|
coord_row, coord_col, ini)
|
|
|
|
# List comprehension is faster than a for loop
|
|
[heappush(inertia, (ini[idx], k, coord_col[idx]))
|
|
for idx in range(n_additions)]
|
|
|
|
# Separate leaves in children (empty lists up to now)
|
|
n_leaves = n_samples
|
|
# sort children to get consistent output with unstructured version
|
|
children = [c[::-1] for c in children]
|
|
children = np.array(children) # return numpy array for efficient caching
|
|
|
|
if return_distance:
|
|
# 2 is scaling factor to compare w/ unstructured version
|
|
distances = np.sqrt(2. * distances)
|
|
return children, n_connected_components, n_leaves, parent, distances
|
|
else:
|
|
return children, n_connected_components, n_leaves, parent
|
|
|
|
|
|
# single average and complete linkage
|
|
def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
|
|
affinity="euclidean", return_distance=False):
|
|
"""Linkage agglomerative clustering based on a Feature matrix.
|
|
|
|
The inertia matrix uses a Heapq-based representation.
|
|
|
|
This is the structured version, that takes into account some topological
|
|
structure between samples.
|
|
|
|
Read more in the :ref:`User Guide <hierarchical_clustering>`.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
feature matrix representing n_samples samples to be clustered
|
|
|
|
connectivity : sparse matrix, default=None
|
|
connectivity matrix. Defines for each sample the neighboring samples
|
|
following a given structure of the data. The matrix is assumed to
|
|
be symmetric and only the upper triangular half is used.
|
|
Default is None, i.e, the Ward algorithm is unstructured.
|
|
|
|
n_clusters : int, default=None
|
|
Stop early the construction of the tree at n_clusters. This is
|
|
useful to decrease computation time if the number of clusters is
|
|
not small compared to the number of samples. In this case, the
|
|
complete tree is not computed, thus the 'children' output is of
|
|
limited use, and the 'parents' output should rather be used.
|
|
This option is valid only when specifying a connectivity matrix.
|
|
|
|
linkage : {"average", "complete", "single"}, default="complete"
|
|
Which linkage criteria to use. The linkage criterion determines which
|
|
distance to use between sets of observation.
|
|
- average uses the average of the distances of each observation of
|
|
the two sets
|
|
- complete or maximum linkage uses the maximum distances between
|
|
all observations of the two sets.
|
|
- single uses the minimum of the distances between all observations
|
|
of the two sets.
|
|
|
|
affinity : str or callable, default="euclidean".
|
|
which metric to use. Can be "euclidean", "manhattan", or any
|
|
distance know to paired distance (see metric.pairwise)
|
|
|
|
return_distance : bool, default=False
|
|
whether or not to return the distances between the clusters.
|
|
|
|
Returns
|
|
-------
|
|
children : ndarray of shape (n_nodes-1, 2)
|
|
The children of each non-leaf node. Values less than `n_samples`
|
|
correspond to leaves of the tree which are the original samples.
|
|
A node `i` greater than or equal to `n_samples` is a non-leaf
|
|
node and has children `children_[i - n_samples]`. Alternatively
|
|
at the i-th iteration, children[i][0] and children[i][1]
|
|
are merged to form node `n_samples + i`
|
|
|
|
n_connected_components : int
|
|
The number of connected components in the graph.
|
|
|
|
n_leaves : int
|
|
The number of leaves in the tree.
|
|
|
|
parents : ndarray of shape (n_nodes, ) or None
|
|
The parent of each node. Only returned when a connectivity matrix
|
|
is specified, elsewhere 'None' is returned.
|
|
|
|
distances : ndarray of shape (n_nodes-1,)
|
|
Returned when return_distance is set to True.
|
|
|
|
distances[i] refers to the distance between children[i][0] and
|
|
children[i][1] when they are merged.
|
|
|
|
See Also
|
|
--------
|
|
ward_tree : Hierarchical clustering with ward linkage.
|
|
"""
|
|
X = np.asarray(X)
|
|
if X.ndim == 1:
|
|
X = np.reshape(X, (-1, 1))
|
|
n_samples, n_features = X.shape
|
|
|
|
linkage_choices = {'complete': _hierarchical.max_merge,
|
|
'average': _hierarchical.average_merge,
|
|
'single': None} # Single linkage is handled differently
|
|
try:
|
|
join_func = linkage_choices[linkage]
|
|
except KeyError as e:
|
|
raise ValueError(
|
|
'Unknown linkage option, linkage should be one '
|
|
'of %s, but %s was given' % (linkage_choices.keys(), linkage)
|
|
) from e
|
|
|
|
if affinity == 'cosine' and np.any(~np.any(X, axis=1)):
|
|
raise ValueError(
|
|
'Cosine affinity cannot be used when X contains zero vectors')
|
|
|
|
if connectivity is None:
|
|
from scipy.cluster import hierarchy # imports PIL
|
|
|
|
if n_clusters is not None:
|
|
warnings.warn('Partial build of the tree is implemented '
|
|
'only for structured clustering (i.e. with '
|
|
'explicit connectivity). The algorithm '
|
|
'will build the full tree and only '
|
|
'retain the lower branches required '
|
|
'for the specified number of clusters',
|
|
stacklevel=2)
|
|
|
|
if affinity == 'precomputed':
|
|
# for the linkage function of hierarchy to work on precomputed
|
|
# data, provide as first argument an ndarray of the shape returned
|
|
# by sklearn.metrics.pairwise_distances.
|
|
if X.shape[0] != X.shape[1]:
|
|
raise ValueError(
|
|
'Distance matrix should be square, '
|
|
'Got matrix of shape {X.shape}'
|
|
)
|
|
i, j = np.triu_indices(X.shape[0], k=1)
|
|
X = X[i, j]
|
|
elif affinity == 'l2':
|
|
# Translate to something understood by scipy
|
|
affinity = 'euclidean'
|
|
elif affinity in ('l1', 'manhattan'):
|
|
affinity = 'cityblock'
|
|
elif callable(affinity):
|
|
X = affinity(X)
|
|
i, j = np.triu_indices(X.shape[0], k=1)
|
|
X = X[i, j]
|
|
if (linkage == 'single'
|
|
and affinity != 'precomputed'
|
|
and not callable(affinity)
|
|
and affinity in METRIC_MAPPING):
|
|
|
|
# We need the fast cythonized metric from neighbors
|
|
dist_metric = DistanceMetric.get_metric(affinity)
|
|
|
|
# The Cython routines used require contiguous arrays
|
|
X = np.ascontiguousarray(X, dtype=np.double)
|
|
|
|
mst = _hierarchical.mst_linkage_core(X, dist_metric)
|
|
# Sort edges of the min_spanning_tree by weight
|
|
mst = mst[np.argsort(mst.T[2], kind='mergesort'), :]
|
|
|
|
# Convert edge list into standard hierarchical clustering format
|
|
out = _hierarchical.single_linkage_label(mst)
|
|
else:
|
|
out = hierarchy.linkage(X, method=linkage, metric=affinity)
|
|
children_ = out[:, :2].astype(int, copy=False)
|
|
|
|
if return_distance:
|
|
distances = out[:, 2]
|
|
return children_, 1, n_samples, None, distances
|
|
return children_, 1, n_samples, None
|
|
|
|
connectivity, n_connected_components = _fix_connectivity(
|
|
X, connectivity,
|
|
affinity=affinity)
|
|
connectivity = connectivity.tocoo()
|
|
# Put the diagonal to zero
|
|
diag_mask = (connectivity.row != connectivity.col)
|
|
connectivity.row = connectivity.row[diag_mask]
|
|
connectivity.col = connectivity.col[diag_mask]
|
|
connectivity.data = connectivity.data[diag_mask]
|
|
del diag_mask
|
|
|
|
if affinity == 'precomputed':
|
|
distances = X[connectivity.row, connectivity.col].astype(
|
|
'float64', **_astype_copy_false(X))
|
|
else:
|
|
# FIXME We compute all the distances, while we could have only computed
|
|
# the "interesting" distances
|
|
distances = paired_distances(X[connectivity.row],
|
|
X[connectivity.col],
|
|
metric=affinity)
|
|
connectivity.data = distances
|
|
|
|
if n_clusters is None:
|
|
n_nodes = 2 * n_samples - 1
|
|
else:
|
|
assert n_clusters <= n_samples
|
|
n_nodes = 2 * n_samples - n_clusters
|
|
|
|
if linkage == 'single':
|
|
return _single_linkage_tree(connectivity, n_samples, n_nodes,
|
|
n_clusters, n_connected_components,
|
|
return_distance)
|
|
|
|
if return_distance:
|
|
distances = np.empty(n_nodes - n_samples)
|
|
# create inertia heap and connection matrix
|
|
A = np.empty(n_nodes, dtype=object)
|
|
inertia = list()
|
|
|
|
# LIL seems to the best format to access the rows quickly,
|
|
# without the numpy overhead of slicing CSR indices and data.
|
|
connectivity = connectivity.tolil()
|
|
# We are storing the graph in a list of IntFloatDict
|
|
for ind, (data, row) in enumerate(zip(connectivity.data,
|
|
connectivity.rows)):
|
|
A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
|
|
np.asarray(data, dtype=np.float64))
|
|
# We keep only the upper triangular for the heap
|
|
# Generator expressions are faster than arrays on the following
|
|
inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
|
|
for r, d in zip(row, data) if r < ind)
|
|
del connectivity
|
|
|
|
heapify(inertia)
|
|
|
|
# prepare the main fields
|
|
parent = np.arange(n_nodes, dtype=np.intp)
|
|
used_node = np.ones(n_nodes, dtype=np.intp)
|
|
children = []
|
|
|
|
# recursive merge loop
|
|
for k in range(n_samples, n_nodes):
|
|
# identify the merge
|
|
while True:
|
|
edge = heappop(inertia)
|
|
if used_node[edge.a] and used_node[edge.b]:
|
|
break
|
|
i = edge.a
|
|
j = edge.b
|
|
|
|
if return_distance:
|
|
# store distances
|
|
distances[k - n_samples] = edge.weight
|
|
|
|
parent[i] = parent[j] = k
|
|
children.append((i, j))
|
|
# Keep track of the number of elements per cluster
|
|
n_i = used_node[i]
|
|
n_j = used_node[j]
|
|
used_node[k] = n_i + n_j
|
|
used_node[i] = used_node[j] = False
|
|
|
|
# update the structure matrix A and the inertia matrix
|
|
# a clever 'min', or 'max' operation between A[i] and A[j]
|
|
coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
|
|
for col, d in coord_col:
|
|
A[col].append(k, d)
|
|
# Here we use the information from coord_col (containing the
|
|
# distances) to update the heap
|
|
heappush(inertia, _hierarchical.WeightedEdge(d, k, col))
|
|
A[k] = coord_col
|
|
# Clear A[i] and A[j] to save memory
|
|
A[i] = A[j] = 0
|
|
|
|
# Separate leaves in children (empty lists up to now)
|
|
n_leaves = n_samples
|
|
|
|
# # return numpy array for efficient caching
|
|
children = np.array(children)[:, ::-1]
|
|
|
|
if return_distance:
|
|
return children, n_connected_components, n_leaves, parent, distances
|
|
return children, n_connected_components, n_leaves, parent
|
|
|
|
|
|
# Matching names to tree-building strategies
|
|
def _complete_linkage(*args, **kwargs):
|
|
kwargs['linkage'] = 'complete'
|
|
return linkage_tree(*args, **kwargs)
|
|
|
|
|
|
def _average_linkage(*args, **kwargs):
|
|
kwargs['linkage'] = 'average'
|
|
return linkage_tree(*args, **kwargs)
|
|
|
|
|
|
def _single_linkage(*args, **kwargs):
|
|
kwargs['linkage'] = 'single'
|
|
return linkage_tree(*args, **kwargs)
|
|
|
|
|
|
_TREE_BUILDERS = dict(
|
|
ward=ward_tree,
|
|
complete=_complete_linkage,
|
|
average=_average_linkage,
|
|
single=_single_linkage)
|
|
|
|
|
|
###############################################################################
|
|
# Functions for cutting hierarchical clustering tree
|
|
|
|
def _hc_cut(n_clusters, children, n_leaves):
|
|
"""Function cutting the ward tree for a given number of clusters.
|
|
|
|
Parameters
|
|
----------
|
|
n_clusters : int or ndarray
|
|
The number of clusters to form.
|
|
|
|
children : ndarray of shape (n_nodes-1, 2)
|
|
The children of each non-leaf node. Values less than `n_samples`
|
|
correspond to leaves of the tree which are the original samples.
|
|
A node `i` greater than or equal to `n_samples` is a non-leaf
|
|
node and has children `children_[i - n_samples]`. Alternatively
|
|
at the i-th iteration, children[i][0] and children[i][1]
|
|
are merged to form node `n_samples + i`
|
|
|
|
n_leaves : int
|
|
Number of leaves of the tree.
|
|
|
|
Returns
|
|
-------
|
|
labels : array [n_samples]
|
|
cluster labels for each point
|
|
|
|
"""
|
|
if n_clusters > n_leaves:
|
|
raise ValueError('Cannot extract more clusters than samples: '
|
|
'%s clusters where given for a tree with %s leaves.'
|
|
% (n_clusters, n_leaves))
|
|
# In this function, we store nodes as a heap to avoid recomputing
|
|
# the max of the nodes: the first element is always the smallest
|
|
# We use negated indices as heaps work on smallest elements, and we
|
|
# are interested in largest elements
|
|
# children[-1] is the root of the tree
|
|
nodes = [-(max(children[-1]) + 1)]
|
|
for _ in range(n_clusters - 1):
|
|
# As we have a heap, nodes[0] is the smallest element
|
|
these_children = children[-nodes[0] - n_leaves]
|
|
# Insert the 2 children and remove the largest node
|
|
heappush(nodes, -these_children[0])
|
|
heappushpop(nodes, -these_children[1])
|
|
label = np.zeros(n_leaves, dtype=np.intp)
|
|
for i, node in enumerate(nodes):
|
|
label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i
|
|
return label
|
|
|
|
|
|
###############################################################################
|
|
|
|
class AgglomerativeClustering(ClusterMixin, BaseEstimator):
|
|
"""
|
|
Agglomerative Clustering
|
|
|
|
Recursively merges the pair of clusters that minimally increases
|
|
a given linkage distance.
|
|
|
|
Read more in the :ref:`User Guide <hierarchical_clustering>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_clusters : int or None, default=2
|
|
The number of clusters to find. It must be ``None`` if
|
|
``distance_threshold`` is not ``None``.
|
|
|
|
affinity : str or callable, default='euclidean'
|
|
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
|
|
"manhattan", "cosine", or "precomputed".
|
|
If linkage is "ward", only "euclidean" is accepted.
|
|
If "precomputed", a distance matrix (instead of a similarity matrix)
|
|
is needed as input for the fit method.
|
|
|
|
memory : str or object with the joblib.Memory interface, default=None
|
|
Used to cache the output of the computation of the tree.
|
|
By default, no caching is done. If a string is given, it is the
|
|
path to the caching directory.
|
|
|
|
connectivity : array-like or callable, default=None
|
|
Connectivity matrix. Defines for each sample the neighboring
|
|
samples following a given structure of the data.
|
|
This can be a connectivity matrix itself or a callable that transforms
|
|
the data into a connectivity matrix, such as derived from
|
|
kneighbors_graph. Default is ``None``, i.e, the
|
|
hierarchical clustering algorithm is unstructured.
|
|
|
|
compute_full_tree : 'auto' or bool, default='auto'
|
|
Stop early the construction of the tree at ``n_clusters``. This is
|
|
useful to decrease computation time if the number of clusters is not
|
|
small compared to the number of samples. This option is useful only
|
|
when specifying a connectivity matrix. Note also that when varying the
|
|
number of clusters and using caching, it may be advantageous to compute
|
|
the full tree. It must be ``True`` if ``distance_threshold`` is not
|
|
``None``. By default `compute_full_tree` is "auto", which is equivalent
|
|
to `True` when `distance_threshold` is not `None` or that `n_clusters`
|
|
is inferior to the maximum between 100 or `0.02 * n_samples`.
|
|
Otherwise, "auto" is equivalent to `False`.
|
|
|
|
linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
|
|
Which linkage criterion to use. The linkage criterion determines which
|
|
distance to use between sets of observation. The algorithm will merge
|
|
the pairs of cluster that minimize this criterion.
|
|
|
|
- 'ward' minimizes the variance of the clusters being merged.
|
|
- 'average' uses the average of the distances of each observation of
|
|
the two sets.
|
|
- 'complete' or 'maximum' linkage uses the maximum distances between
|
|
all observations of the two sets.
|
|
- 'single' uses the minimum of the distances between all observations
|
|
of the two sets.
|
|
|
|
.. versionadded:: 0.20
|
|
Added the 'single' option
|
|
|
|
distance_threshold : float, default=None
|
|
The linkage distance threshold above which, clusters will not be
|
|
merged. If not ``None``, ``n_clusters`` must be ``None`` and
|
|
``compute_full_tree`` must be ``True``.
|
|
|
|
.. versionadded:: 0.21
|
|
|
|
compute_distances : bool, default=False
|
|
Computes distances between clusters even if `distance_threshold` is not
|
|
used. This can be used to make dendrogram visualization, but introduces
|
|
a computational and memory overhead.
|
|
|
|
.. versionadded:: 0.24
|
|
|
|
Attributes
|
|
----------
|
|
n_clusters_ : int
|
|
The number of clusters found by the algorithm. If
|
|
``distance_threshold=None``, it will be equal to the given
|
|
``n_clusters``.
|
|
|
|
labels_ : ndarray of shape (n_samples)
|
|
cluster labels for each point
|
|
|
|
n_leaves_ : int
|
|
Number of leaves in the hierarchical tree.
|
|
|
|
n_connected_components_ : int
|
|
The estimated number of connected components in the graph.
|
|
|
|
.. versionadded:: 0.21
|
|
``n_connected_components_`` was added to replace ``n_components_``.
|
|
|
|
children_ : array-like of shape (n_samples-1, 2)
|
|
The children of each non-leaf node. Values less than `n_samples`
|
|
correspond to leaves of the tree which are the original samples.
|
|
A node `i` greater than or equal to `n_samples` is a non-leaf
|
|
node and has children `children_[i - n_samples]`. Alternatively
|
|
at the i-th iteration, children[i][0] and children[i][1]
|
|
are merged to form node `n_samples + i`
|
|
|
|
distances_ : array-like of shape (n_nodes-1,)
|
|
Distances between nodes in the corresponding place in `children_`.
|
|
Only computed if `distance_threshold` is used or `compute_distances`
|
|
is set to `True`.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.cluster import AgglomerativeClustering
|
|
>>> import numpy as np
|
|
>>> X = np.array([[1, 2], [1, 4], [1, 0],
|
|
... [4, 2], [4, 4], [4, 0]])
|
|
>>> clustering = AgglomerativeClustering().fit(X)
|
|
>>> clustering
|
|
AgglomerativeClustering()
|
|
>>> clustering.labels_
|
|
array([1, 1, 1, 0, 0, 0])
|
|
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, n_clusters=2, *, affinity="euclidean",
|
|
memory=None,
|
|
connectivity=None, compute_full_tree='auto',
|
|
linkage='ward', distance_threshold=None,
|
|
compute_distances=False):
|
|
self.n_clusters = n_clusters
|
|
self.distance_threshold = distance_threshold
|
|
self.memory = memory
|
|
self.connectivity = connectivity
|
|
self.compute_full_tree = compute_full_tree
|
|
self.linkage = linkage
|
|
self.affinity = affinity
|
|
self.compute_distances = compute_distances
|
|
|
|
def fit(self, X, y=None):
|
|
"""Fit the hierarchical clustering from features, or distance matrix.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)
|
|
Training instances to cluster, or distances between instances if
|
|
``affinity='precomputed'``.
|
|
|
|
y : Ignored
|
|
Not used, present here for API consistency by convention.
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
X = self._validate_data(X, ensure_min_samples=2, estimator=self)
|
|
memory = check_memory(self.memory)
|
|
|
|
if self.n_clusters is not None and self.n_clusters <= 0:
|
|
raise ValueError("n_clusters should be an integer greater than 0."
|
|
" %s was provided." % str(self.n_clusters))
|
|
|
|
if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
|
|
raise ValueError("Exactly one of n_clusters and "
|
|
"distance_threshold has to be set, and the other "
|
|
"needs to be None.")
|
|
|
|
if (self.distance_threshold is not None
|
|
and not self.compute_full_tree):
|
|
raise ValueError("compute_full_tree must be True if "
|
|
"distance_threshold is set.")
|
|
|
|
if self.linkage == "ward" and self.affinity != "euclidean":
|
|
raise ValueError("%s was provided as affinity. Ward can only "
|
|
"work with euclidean distances." %
|
|
(self.affinity, ))
|
|
|
|
if self.linkage not in _TREE_BUILDERS:
|
|
raise ValueError("Unknown linkage type %s. "
|
|
"Valid options are %s" % (self.linkage,
|
|
_TREE_BUILDERS.keys()))
|
|
tree_builder = _TREE_BUILDERS[self.linkage]
|
|
|
|
connectivity = self.connectivity
|
|
if self.connectivity is not None:
|
|
if callable(self.connectivity):
|
|
connectivity = self.connectivity(X)
|
|
connectivity = check_array(
|
|
connectivity, accept_sparse=['csr', 'coo', 'lil'])
|
|
|
|
n_samples = len(X)
|
|
compute_full_tree = self.compute_full_tree
|
|
if self.connectivity is None:
|
|
compute_full_tree = True
|
|
if compute_full_tree == 'auto':
|
|
if self.distance_threshold is not None:
|
|
compute_full_tree = True
|
|
else:
|
|
# Early stopping is likely to give a speed up only for
|
|
# a large number of clusters. The actual threshold
|
|
# implemented here is heuristic
|
|
compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
|
|
n_clusters = self.n_clusters
|
|
if compute_full_tree:
|
|
n_clusters = None
|
|
|
|
# Construct the tree
|
|
kwargs = {}
|
|
if self.linkage != 'ward':
|
|
kwargs['linkage'] = self.linkage
|
|
kwargs['affinity'] = self.affinity
|
|
|
|
distance_threshold = self.distance_threshold
|
|
|
|
return_distance = (
|
|
(distance_threshold is not None) or self.compute_distances
|
|
)
|
|
|
|
out = memory.cache(tree_builder)(X, connectivity=connectivity,
|
|
n_clusters=n_clusters,
|
|
return_distance=return_distance,
|
|
**kwargs)
|
|
(self.children_,
|
|
self.n_connected_components_,
|
|
self.n_leaves_,
|
|
parents) = out[:4]
|
|
|
|
if return_distance:
|
|
self.distances_ = out[-1]
|
|
|
|
if self.distance_threshold is not None: # distance_threshold is used
|
|
self.n_clusters_ = np.count_nonzero(
|
|
self.distances_ >= distance_threshold) + 1
|
|
else: # n_clusters is used
|
|
self.n_clusters_ = self.n_clusters
|
|
|
|
# Cut the tree
|
|
if compute_full_tree:
|
|
self.labels_ = _hc_cut(self.n_clusters_, self.children_,
|
|
self.n_leaves_)
|
|
else:
|
|
labels = _hierarchical.hc_get_heads(parents, copy=False)
|
|
# copy to avoid holding a reference on the original array
|
|
labels = np.copy(labels[:n_samples])
|
|
# Reassign cluster numbers
|
|
self.labels_ = np.searchsorted(np.unique(labels), labels)
|
|
return self
|
|
|
|
def fit_predict(self, X, y=None):
|
|
"""Fit the hierarchical clustering from features or distance matrix,
|
|
and return cluster labels.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or \
|
|
(n_samples, n_samples)
|
|
Training instances to cluster, or distances between instances if
|
|
``affinity='precomputed'``.
|
|
|
|
y : Ignored
|
|
Not used, present here for API consistency by convention.
|
|
|
|
Returns
|
|
-------
|
|
labels : ndarray of shape (n_samples,)
|
|
Cluster labels.
|
|
"""
|
|
return super().fit_predict(X, y)
|
|
|
|
|
|
class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
|
|
"""Agglomerate features.
|
|
|
|
Similar to AgglomerativeClustering, but recursively merges features
|
|
instead of samples.
|
|
|
|
Read more in the :ref:`User Guide <hierarchical_clustering>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_clusters : int, default=2
|
|
The number of clusters to find. It must be ``None`` if
|
|
``distance_threshold`` is not ``None``.
|
|
|
|
affinity : str or callable, default='euclidean'
|
|
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
|
|
"manhattan", "cosine", or 'precomputed'.
|
|
If linkage is "ward", only "euclidean" is accepted.
|
|
|
|
memory : str or object with the joblib.Memory interface, default=None
|
|
Used to cache the output of the computation of the tree.
|
|
By default, no caching is done. If a string is given, it is the
|
|
path to the caching directory.
|
|
|
|
connectivity : array-like or callable, default=None
|
|
Connectivity matrix. Defines for each feature the neighboring
|
|
features following a given structure of the data.
|
|
This can be a connectivity matrix itself or a callable that transforms
|
|
the data into a connectivity matrix, such as derived from
|
|
kneighbors_graph. Default is None, i.e, the
|
|
hierarchical clustering algorithm is unstructured.
|
|
|
|
compute_full_tree : 'auto' or bool, default='auto'
|
|
Stop early the construction of the tree at n_clusters. This is useful
|
|
to decrease computation time if the number of clusters is not small
|
|
compared to the number of features. This option is useful only when
|
|
specifying a connectivity matrix. Note also that when varying the
|
|
number of clusters and using caching, it may be advantageous to compute
|
|
the full tree. It must be ``True`` if ``distance_threshold`` is not
|
|
``None``. By default `compute_full_tree` is "auto", which is equivalent
|
|
to `True` when `distance_threshold` is not `None` or that `n_clusters`
|
|
is inferior to the maximum between 100 or `0.02 * n_samples`.
|
|
Otherwise, "auto" is equivalent to `False`.
|
|
|
|
linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
|
|
Which linkage criterion to use. The linkage criterion determines which
|
|
distance to use between sets of features. The algorithm will merge
|
|
the pairs of cluster that minimize this criterion.
|
|
|
|
- ward minimizes the variance of the clusters being merged.
|
|
- average uses the average of the distances of each feature of
|
|
the two sets.
|
|
- complete or maximum linkage uses the maximum distances between
|
|
all features of the two sets.
|
|
- single uses the minimum of the distances between all observations
|
|
of the two sets.
|
|
|
|
pooling_func : callable, default=np.mean
|
|
This combines the values of agglomerated features into a single
|
|
value, and should accept an array of shape [M, N] and the keyword
|
|
argument `axis=1`, and reduce it to an array of size [M].
|
|
|
|
distance_threshold : float, default=None
|
|
The linkage distance threshold above which, clusters will not be
|
|
merged. If not ``None``, ``n_clusters`` must be ``None`` and
|
|
``compute_full_tree`` must be ``True``.
|
|
|
|
.. versionadded:: 0.21
|
|
|
|
compute_distances : bool, default=False
|
|
Computes distances between clusters even if `distance_threshold` is not
|
|
used. This can be used to make dendrogram visualization, but introduces
|
|
a computational and memory overhead.
|
|
|
|
.. versionadded:: 0.24
|
|
|
|
Attributes
|
|
----------
|
|
n_clusters_ : int
|
|
The number of clusters found by the algorithm. If
|
|
``distance_threshold=None``, it will be equal to the given
|
|
``n_clusters``.
|
|
|
|
labels_ : array-like of (n_features,)
|
|
cluster labels for each feature.
|
|
|
|
n_leaves_ : int
|
|
Number of leaves in the hierarchical tree.
|
|
|
|
n_connected_components_ : int
|
|
The estimated number of connected components in the graph.
|
|
|
|
.. versionadded:: 0.21
|
|
``n_connected_components_`` was added to replace ``n_components_``.
|
|
|
|
children_ : array-like of shape (n_nodes-1, 2)
|
|
The children of each non-leaf node. Values less than `n_features`
|
|
correspond to leaves of the tree which are the original samples.
|
|
A node `i` greater than or equal to `n_features` is a non-leaf
|
|
node and has children `children_[i - n_features]`. Alternatively
|
|
at the i-th iteration, children[i][0] and children[i][1]
|
|
are merged to form node `n_features + i`
|
|
|
|
distances_ : array-like of shape (n_nodes-1,)
|
|
Distances between nodes in the corresponding place in `children_`.
|
|
Only computed if `distance_threshold` is used or `compute_distances`
|
|
is set to `True`.
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> from sklearn import datasets, cluster
|
|
>>> digits = datasets.load_digits()
|
|
>>> images = digits.images
|
|
>>> X = np.reshape(images, (len(images), -1))
|
|
>>> agglo = cluster.FeatureAgglomeration(n_clusters=32)
|
|
>>> agglo.fit(X)
|
|
FeatureAgglomeration(n_clusters=32)
|
|
>>> X_reduced = agglo.transform(X)
|
|
>>> X_reduced.shape
|
|
(1797, 32)
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, n_clusters=2, *, affinity="euclidean",
|
|
memory=None,
|
|
connectivity=None, compute_full_tree='auto',
|
|
linkage='ward', pooling_func=np.mean,
|
|
distance_threshold=None, compute_distances=False):
|
|
super().__init__(
|
|
n_clusters=n_clusters, memory=memory, connectivity=connectivity,
|
|
compute_full_tree=compute_full_tree, linkage=linkage,
|
|
affinity=affinity, distance_threshold=distance_threshold,
|
|
compute_distances=compute_distances)
|
|
self.pooling_func = pooling_func
|
|
|
|
def fit(self, X, y=None, **params):
|
|
"""Fit the hierarchical clustering on the data
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
The data
|
|
|
|
y : Ignored
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
|
|
ensure_min_features=2, estimator=self)
|
|
# save n_features_in_ attribute here to reset it after, because it will
|
|
# be overridden in AgglomerativeClustering since we passed it X.T.
|
|
n_features_in_ = self.n_features_in_
|
|
AgglomerativeClustering.fit(self, X.T, **params)
|
|
self.n_features_in_ = n_features_in_
|
|
return self
|
|
|
|
@property
|
|
def fit_predict(self):
|
|
raise AttributeError
|