Coverage for nilearn/regions/hierarchical_kmeans_clustering.py: 19%

109 statements  

coverage.py v7.9.1, created at 2025-06-20 10:58 +0200

1"""Hierarchical k-means clustering.""" 

2 

3import warnings 

4 

5import numpy as np 

6from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin 

7from sklearn.cluster import MiniBatchKMeans 

8from sklearn.utils import check_array 

9from sklearn.utils.validation import check_is_fitted 

10 

11from nilearn._utils import fill_doc 

12from nilearn._utils.logger import find_stack_level 

13from nilearn._utils.tags import SKLEARN_LT_1_6 

14 

15 

16def _remove_empty_labels(labels): 

17 """Remove empty values label values from labels list. 

18 

19 Returns labels mapped to np.arange(n_unique), 

20 where n_unique is the number of unique values in labels 

21 """ 

22 vals = np.unique(labels) 

23 inverse_vals = -np.ones(labels.max() + 1, dtype=int) 

24 inverse_vals[vals] = np.arange(len(vals)) 

25 return inverse_vals[labels] 
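
# Illustrative example (not part of the original module): for
# labels = np.array([0, 2, 2, 5]), _remove_empty_labels returns
# array([0, 1, 1, 2]); the unique values {0, 2, 5} are re-indexed onto
# np.arange(3), so downstream code can assume a contiguous label range.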


def _adjust_small_clusters(array, n_clusters):
    """Take an ndarray of floats summing to n_clusters \
    and try to round it while enforcing that the rounded array \
    still sums to n_clusters and every element is at least 1.
    """
    array_round = np.rint(array).astype(int)
    array_round = np.maximum(array_round, 1)

    if np.sum(array_round) < n_clusters:
        while np.sum(array_round) != n_clusters:
            idx = np.argmax(array - array_round)
            array_round[idx] += 1
    elif np.sum(array_round) == n_clusters:
        pass
    elif np.sum(array_round) > n_clusters:
        parent_idx_ = np.arange(array_round.shape[0])
        while np.sum(array_round) != n_clusters:
            # prevent an element rounded to 1 from being decreased
            # in edge cases
            mask = array_round != 1
            idx = np.argmin(array[mask] - array_round[mask])
            parent_idx = parent_idx_[mask][idx]
            array_round[parent_idx] -= 1
    return array_round
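
# Illustrative example (not part of the original module): for
# array = np.array([1.4, 1.4, 2.2]) and n_clusters = 5, np.rint gives
# [1, 1, 2], which sums to 4, so the entry with the largest rounding
# deficit is incremented, yielding [2, 1, 2]. The mask in the
# over-allocation branch guarantees no entry is ever pushed below 1.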


@fill_doc
def hierarchical_k_means(
    X,
    n_clusters,
    init="k-means++",
    batch_size=1000,
    n_init=10,
    max_no_improvement=10,
    verbose=0,
    random_state=0,
):
    """Use a recursive k-means to cluster X.

    First cluster the data into sqrt(n_clusters) parcels,
    then run k-means a second time on each parcel.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
        Data to cluster.

    n_clusters : :obj:`int`
        The number of clusters to find.

    init : {'k-means++', 'random' or an ndarray}, default='k-means++'
        Method for initialization.
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

    batch_size : :obj:`int`, default=1000
        Size of the mini batches. (K-means is performed through
        MiniBatchKMeans.)

    n_init : :obj:`int`, default=10
        Number of random initializations that are tried.
        In contrast to KMeans, the algorithm is only run once, using the
        best of the ``n_init`` initializations as measured by inertia.

    max_no_improvement : :obj:`int`, default=10
        Control early stopping based on the consecutive number of mini
        batches that do not yield an improvement on the smoothed inertia.
        To disable convergence detection based on inertia, set
        max_no_improvement to None.

    random_state : :obj:`int`, RandomState instance or None, default=0
        Determines random number generation for centroid initialization and
        random reassignment. Use an int to make the randomness deterministic.

    %(verbose0)s

    Returns
    -------
    labels : ndarray of ints (len n_features)
        Parcellation of features in clusters.
    """
    n_big_clusters = int(np.sqrt(n_clusters))
    mbk = MiniBatchKMeans(
        init=init,
        n_clusters=n_big_clusters,
        batch_size=batch_size,
        n_init=n_init,
        max_no_improvement=max_no_improvement,
        verbose=verbose,
        random_state=random_state,
    ).fit(X)
    coarse_labels = mbk.labels_
    fine_labels = np.zeros_like(coarse_labels)
    q = 0
    counts = np.bincount(coarse_labels)
    exact_clusters = np.asarray(
        [
            n_clusters * counts[i] * 1.0 / X.shape[0]
            for i in range(n_big_clusters)
        ]
    )

    adjusted_clusters = _adjust_small_clusters(exact_clusters, n_clusters)
    for i, n_small_clusters in enumerate(adjusted_clusters):
        mbk = MiniBatchKMeans(
            init=init,
            n_clusters=n_small_clusters,
            batch_size=batch_size,
            random_state=random_state,
            max_no_improvement=max_no_improvement,
            verbose=verbose,
            n_init=n_init,
        ).fit(X[coarse_labels == i])
        fine_labels[coarse_labels == i] = q + mbk.labels_
        q += n_small_clusters

    return _remove_empty_labels(fine_labels)
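
# A minimal usage sketch (illustrative, synthetic data; not part of the
# original module): the function clusters the rows of X, so to parcellate
# features (voxels) pass a (n_features, n_samples) matrix, as
# HierarchicalKMeans.fit does.
#
#     rng = np.random.default_rng(0)
#     X = rng.standard_normal((200, 50))       # 200 voxels, 50 time points
#     labels = hierarchical_k_means(X, n_clusters=10)
#     labels.shape                             # (200,)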


@fill_doc
class HierarchicalKMeans(ClusterMixin, TransformerMixin, BaseEstimator):
    """Hierarchical KMeans.

    First cluster the samples into big clusters. Then cluster the samples
    inside these big clusters into smaller ones.

    Parameters
    ----------
    n_clusters : :obj:`int`
        The number of clusters to find.

    init : {'k-means++', 'random' or an ndarray}, default='k-means++'
        Method for initialization.

        * 'k-means++' : selects initial cluster centers for k-means
          clustering in a smart way to speed up convergence. See section
          Notes in k_init for more details.

        * 'random': choose k observations (rows) at random from data for
          the initial centroids.

        * If an ndarray is passed, it should be of shape (n_clusters,
          n_features) and gives the initial centers.

    batch_size : :obj:`int`, default=1000
        Size of the mini batches. (K-means is performed through
        MiniBatchKMeans.)

    n_init : :obj:`int`, default=10
        Number of random initializations that are tried.
        In contrast to KMeans, the algorithm is only run once, using the
        best of the ``n_init`` initializations as measured by inertia.

    max_no_improvement : :obj:`int`, default=10
        Control early stopping based on the consecutive number of mini
        batches that do not yield an improvement on the smoothed inertia.
        To disable convergence detection based on inertia, set
        max_no_improvement to None.

    random_state : :obj:`int`, RandomState instance or None, default=0
        Determines random number generation for centroid initialization and
        random reassignment. Use an int to make the randomness deterministic.

    scaling : :obj:`bool`, default=False
        If scaling is True, each cluster is scaled by the square root of its
        size during transform(), preserving the l2-norm of the image.
        inverse_transform() will apply the inverse scaling to yield an image
        with the same l2-norm as the input.

    %(verbose0)s

    Attributes
    ----------
    labels_ : ndarray, shape = [n_features]
        Cluster labels for each feature.

    sizes_ : ndarray, shape = [n_clusters]
        Contains the size of each cluster.

    """


    def __init__(
        self,
        n_clusters=None,
        init="k-means++",
        batch_size=1000,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
        random_state=0,
        scaling=False,
    ):
        self.n_clusters = n_clusters
        self.init = init
        self.batch_size = batch_size
        self.n_init = n_init
        self.max_no_improvement = max_no_improvement
        self.verbose = verbose
        self.random_state = random_state
        self.scaling = scaling

    def _more_tags(self):
        """Return estimator tags.

        TODO remove when bumping sklearn_version > 1.5
        """
        return self.__sklearn_tags__()

    def __sklearn_tags__(self):
        """Return estimator tags.

        See the sklearn documentation for more details on tags
        https://scikit-learn.org/1.6/developers/develop.html#estimator-tags
        """
        # TODO: get rid of this if block when bumping sklearn_version > 1.5
        if SKLEARN_LT_1_6:
            from nilearn._utils.tags import tags

            return tags()

        from nilearn._utils.tags import InputTags

        tags = super().__sklearn_tags__()
        tags.input_tags = InputTags(niimg_like=False)
        return tags


    @fill_doc
    def fit(self, X, y=None):
        """Compute clustering of the data.

        Parameters
        ----------
        X : ndarray, shape = [n_samples, n_features]
            Training data.

        %(y_dummy)s

        Returns
        -------
        self
        """
        del y
        X = check_array(
            X, ensure_min_features=2, ensure_min_samples=2, estimator=self
        )
        # Transpose the data so that we can cluster features (voxels)
        # and input them as samples to sklearn's clustering algorithm,
        # which does clustering on samples, not on features.
        X = X.T
        # n_features for sklearn's clustering algorithm is therefore the
        # number of samples in the input data.
        n_features = X.shape[1]

        if not isinstance(self.n_clusters, int) or self.n_clusters <= 0:
            raise ValueError(
                "n_clusters should be an integer greater than 0."
                f" {self.n_clusters} was provided."
            )

        if self.n_clusters > n_features:
            self.n_clusters = n_features
            warnings.warn(
                "n_clusters should be at most the number of "
                f"features. Taking n_clusters = {n_features} instead.",
                stacklevel=find_stack_level(),
            )
        self.labels_ = hierarchical_k_means(
            X,
            self.n_clusters,
            self.init,
            self.batch_size,
            self.n_init,
            self.max_no_improvement,
            self.verbose,
            self.random_state,
        )
        sizes = np.bincount(self.labels_)

        self.sizes_ = sizes
        self.n_clusters = len(sizes)
        return self

    def __sklearn_is_fitted__(self):
        return hasattr(self, "labels_")


    @fill_doc
    def transform(
        self,
        X,
        y=None,  # noqa: ARG002
    ):
        """Apply clustering to reduce the dimensionality of the data.

        Parameters
        ----------
        X : ndarray, shape = [n_samples, n_features]
            Data to transform with the fitted clustering.

        %(y_dummy)s

        Returns
        -------
        X_red : ndarray, shape = [n_samples, n_clusters]
            Data reduced with agglomerated signal for each cluster.
        """
        check_is_fitted(self)

        # Transpose the data so that we can cluster features (voxels)
        # and input them as samples to sklearn's clustering algorithm
        X = X.T
        unique_labels = np.arange(self.n_clusters)

        mean_cluster = np.empty(
            (len(unique_labels), X.shape[1]), dtype=X.dtype
        )
        for label in unique_labels:
            mean_cluster[label] = np.mean(X[self.labels_ == label], axis=0)

        X_red = np.array(mean_cluster)

        if self.scaling:
            X_red = X_red * np.sqrt(self.sizes_[:, np.newaxis])

        # Transpose the data back to the original shape, i.e.
        # (n_samples, n_clusters)
        X_red = X_red.T
        return X_red


    def inverse_transform(self, X_red):
        """Send the reduced 2D data matrix back to the original feature \
        space (voxels).

        Parameters
        ----------
        X_red : ndarray, shape = [n_samples, n_clusters]
            Data reduced with agglomerated signal for each cluster.

        Returns
        -------
        X_inv : ndarray, shape = [n_samples, n_features]
            Reduced data expanded back to the original feature space.
        """
        check_is_fitted(self)

        X_red = X_red.T
        inverse = self.labels_
        if self.scaling:
            X_red = X_red / np.sqrt(self.sizes_[:, np.newaxis])
        X_inv = X_red[inverse, ...]
        X_inv = X_inv.T
        return X_inv
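
    # Illustrative note (not in the original source): with scaling=True,
    # transform() multiplies each cluster mean by sqrt(cluster size) and
    # inverse_transform() divides it back out, so
    # inverse_transform(transform(X)) replaces each feature's signal by
    # its cluster mean, consistent with the l2-norm behavior described
    # in the class docstring.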


    def set_output(self, *, transform=None):
        """Set the output container when ``"transform"`` is called.

        .. warning::

            This has not been implemented yet.
        """
        raise NotImplementedError()
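

# A minimal end-to-end sketch (illustrative, synthetic data; not part of
# the original module):
#
#     import numpy as np
#     from nilearn.regions.hierarchical_kmeans_clustering import (
#         HierarchicalKMeans,
#     )
#
#     rng = np.random.default_rng(0)
#     X = rng.standard_normal((50, 200))       # 50 samples, 200 features
#     hkm = HierarchicalKMeans(n_clusters=10).fit(X)
#     X_red = hkm.transform(X)                 # shape (50, 10)
#     X_back = hkm.inverse_transform(X_red)    # shape (50, 200)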