# Coverage report artifact: nilearn/datasets/tests/_testing.py — 34% covered,
# 185 statements (coverage.py v7.9.1, created at 2025-06-20 10:58 +0200).

1"""Utilities for testing the dataset fetchers. 

2 

3Unit tests should not depend on an internet connection nor on external 

4resources such as the servers from which we download datasets. Otherwise, tests 

5can fail spuriously when a web service is unavailable, and tests are slow 

6because downloading data takes a lot of time. 

7 

8Therefore in the tests, we fake the downloads: the function from the requests 

9library that would normally download a file is replaced ("patched") by a 

10"mock", a function that mimics its interface but doesn't download anything and 

11returns fake data instead. 

12 

13As we only patch functions from urllib and requests, nilearn code is unaware of 

14the mocking and can be tested as usual, as long as we provide fake responses 

15that look similar to those we would obtain from dataset providers if we 

16actually sent requests over the network. 

17 

18This module provides the utilities for setting up this mocking: patching the 

19relevant requests and urllib functions, and creating the fake responses. The 

20function from the requests library that nilearn uses to send requests is 

21patched (replaced) by a `Sender` object defined in this module. The 

22corresponding docstring details how individual tests can configure what fake 

23responses it should return for specific URLs. 

24 

25To make sure tests don't rely on previously existing data and don't write 

26outside of temporary directories, this module also adds fixtures to patch the 

27home directory and other default nilearn data directories. 

28 

29""" 

30 

31import fnmatch 

32import json 

33import os 

34import pathlib 

35import pickle 

36import re 

37import shutil 

38import tempfile 

39from collections import OrderedDict 

40from pathlib import Path 

41from unittest.mock import MagicMock 

42 

43import pandas as pd 

44import pytest 

45from nibabel import Nifti1Image 

46from requests.exceptions import HTTPError 

47from sklearn.utils import Bunch 

48 

49from nilearn._utils.testing import serialize_niimg 

50from nilearn.surface.surface import PolyMesh, SurfaceImage 

51 

52 

@pytest.fixture(autouse=True)
def temp_nilearn_data_dir(tmp_path_factory, monkeypatch):
    """Point the home directory and nilearn data env vars at a temp dir.

    Prevents tests that use nilearn.datasets from picking up datasets
    already present on the machine, and from writing into the user's real
    home or nilearn data directories.

    Applied automatically to every test: the fixture uses 'autouse' and is
    imported in conftest.py, so even tests that do not request it are
    covered.
    """
    fake_home = tmp_path_factory.mktemp("temp_nilearn_home")
    for env_var in ("HOME", "USERPROFILE"):
        monkeypatch.setenv(env_var, str(fake_home))
    data_path = fake_home / "nilearn_data"
    data_path.mkdir()
    monkeypatch.setenv("NILEARN_DATA", str(data_path))
    # Note: the shared data dir is only advertised via the env var; it is
    # not created here (matching how nilearn lazily creates it on demand).
    monkeypatch.setenv("NILEARN_SHARED_DATA", str(fake_home / "nilearn_shared_data"))

73 

74 

@pytest.fixture(autouse=True)
def request_mocker(monkeypatch):
    """Patch requests and urllib so no test ever touches the network.

    ``requests.sessions.Session.send`` is replaced with a `Sender`
    instance whose responses can be configured per-test (see the `Sender`
    docstring); the fixture returns that instance so tests can configure
    and inspect it.

    urllib's opener is patched with a plain MagicMock: nilearn fetchers go
    through requests, so most tests never hit it — it exists to prevent
    code from working around the mocking by calling urllib directly, and
    to support testing the FTP adapter.

    Applied automatically to every test: the fixture uses 'autouse' and is
    imported in conftest.py, so even tests that do not request it are
    covered.
    """
    mock_sender = Sender()
    monkeypatch.setattr("urllib.request.OpenerDirector.open", MagicMock())
    monkeypatch.setattr("requests.sessions.Session.send", mock_sender)
    return mock_sender

98 

99 

class Response:
    """Mock stand-in for ``requests.Response``.

    Implements only the subset of the real interface used by nilearn's
    fetchers: context-manager protocol, ``iter_content``, ``text``,
    ``json`` and ``raise_for_status``.
    """

    # Marks this object as a test double.
    is_mock = True

    def __init__(self, content, url, status_code=200):
        self.content = content
        self.url = url
        self.status_code = status_code
        self.headers = {"Content-Length": len(self.content)}
        # Offset at which iter_content starts yielding (tests may move it
        # to simulate resumed downloads).
        self.iter_start = 0

    def __enter__(self):
        return self

    def __exit__(self, *args):
        pass

    def iter_content(self, chunk_size=8):
        """Yield the content in chunks, starting from ``iter_start``."""
        position = self.iter_start
        total = len(self.content)
        while position < total:
            yield self.content[position : position + chunk_size]
            position += chunk_size

    @property
    def text(self):
        """Content decoded as UTF-8."""
        return self.content.decode("utf-8")

    def json(self):
        """Content parsed as JSON."""
        return json.loads(self.text)

    def raise_for_status(self):
        """Raise HTTPError for 4xx/5xx status codes, like real requests."""
        if not 400 <= self.status_code < 600:
            return
        raise HTTPError(f"{self.status_code} Error for url: {self.url}")

137 

138 

class Request:
    """Minimal stand-in for a requests request object: just a url."""

    # Marks this object as a test double.
    is_mock = True

    def __init__(self, url):
        self.url = url

146 

147 

class Sender:
    r"""Mock replacement for ``requests.sessions.Session.send``.

    In nilearn's tests this stands in for the function that requests uses
    to send requests over the network.

    Individual tests configure what fake response should be returned for a
    given URL by adding entries to the ordered dictionary
    ``self.url_mapping``.

    An incoming request url is matched first against the keys of
    ``url_mapping`` (when several keys match, the most recently inserted
    one wins), then against the url patterns found in the files under
    nilearn/datasets/tests/data/archive_contents (details below). When
    nothing matches, a response with empty content is returned.

    Specifying keys
    ---------------
    Keys of url_mapping can be:

    - a `str`: used as a glob pattern and matched against the url with
      fnmatch (special characters can be matched literally by escaping
      them with []). For example:
        '*' matches everything
        '*example.com/*' matches 'https://www.example.com/data'
          but not 'https://example.org'

    - a `re.Pattern` (a compiled regex): matched against the url; groups
      can capture parts needed to build the response. For example:
        re.compile(r'.*example.org/subject_(\d+)\.tar\.gz')
        matches 'https://example.org/subject_12.tar.gz' and captures '12'
        but does not match 'https://example.org/subject_12.csv'

    When no key matches, the Sender falls back to the files found
    (recursively) in nilearn/datasets/tests/data/archive_contents; .py and
    .pyc files are ignored. Each such file describes a zip or tar archive
    containing a list of (empty) files. Its first line is a glob pattern
    stating which urls it applies to; the following lines are the paths
    that will exist inside the archive. For example, a file containing:
        https://example.org/subj_*.tar.gz
        README.txt
        data/img.nii.gz
        data/labels.csv
    produces a tar gzipped archive with this structure:
        .
        ├── data
        │   ├── img.nii.gz
        │   └── labels.csv
        └── README.txt

    Moreover, if the first line starts with 'format:', it specifies the
    archive format — e.g. 'format: zip', 'format: gztar' (see `shutil`
    documentation for available formats) — in which case the url pattern
    is on the second line and the remaining lines list the contents. Paths
    must use '/' as separator; it is converted to the OS separator when
    the file is read. A helper script,
    nilearn/datasets/tests/data/list_archive_contents.sh, can generate
    such files from a url.

    Finally, if no key and no file matches the request url, a response
    with empty content is returned.

    Specifying values
    -----------------
    Once a key matches, the corresponding value is used to build the
    response. The value can be:

    - a callable: invoked as value(match, request), where request is the
      input `Request` object and match is the url itself (string key) or
      the `re.Match` (regex key); the return value is then processed as
      described below.
    - a `Response` instance: returned unchanged.
    - `bytes`: a status-200 Response with these bytes as content.
    - a `str`: if the key was a `re.Pattern`, backreferences such as \1 or
      \g<groupname> are expanded with the matched groups; the resulting
      string is UTF-8 encoded to form the content. For example:
        re.compile(r'.*example\.org/(.*)'): r'hello, \1'
        yields b'hello, nilearn' for the url https://example.org/nilearn
    - an `int`: a response with that status code; the content is b"ERROR"
      for codes in [400, 600[ and b"OK" otherwise.
    - an `Exception`: it is raised.
    - a `pathlib.Path` (or anything with a `read_bytes` attribute, e.g. a
      `pathlib2.Path`): the response content is that file's contents.
    - an object with a `to_filename` method (e.g. a Nifti1Image): it is
      serialized to .nii.gz to produce the response content.

    The helpers `dict_to_archive` and `list_to_archive` in this module
    make it easy to build values that mock downloaded archives; see their
    docstrings.

    Inspecting history
    ------------------
    Sent requests are recorded in `sent_requests`, the visited urls in
    `visited_urls`, and the number of sent requests in `url_count`.

    """

    # Marks this object as a test double.
    is_mock = True

    def __init__(self):
        self.url_mapping = OrderedDict()
        self.sent_requests = []
        self._archive_contents_index = _index_archive_contents()

    @property
    def visited_urls(self):
        """Urls of all requests seen so far, in order."""
        return [req.url for req in self.sent_requests]

    @property
    def url_count(self):
        """Number of requests seen so far."""
        return len(self.visited_urls)

    def __call__(self, request, *args, **kwargs):  # noqa: ARG002
        """Record the request and return the configured fake response."""
        if isinstance(request, str):
            request = Request(request)
        self.sent_requests.append(request)
        # Most recently inserted url_mapping keys take precedence.
        for key, value in reversed(self.url_mapping.items()):
            match = self.match(key, request.url)
            if match is not None:
                return self.get_response(value, match, request)
        # Fall back to the archive_contents files shipped with the tests.
        for key, file_path in self._archive_contents_index.items():
            if self.match(key, request.url) is not None:
                return Response(
                    _archive_from_contents_file(file_path), request.url
                )
        return self.default_response(request)

    def default_response(self, request):
        """Empty-content response used when nothing matches the url."""
        return Response(b"", request.url)

    def match(self, key, url):
        """Return the match object/url if `key` matches `url`, else None."""
        if isinstance(key, re.Pattern):
            return key.match(url)
        if isinstance(key, str) and fnmatch.fnmatch(url, key):
            return url
        return None

    def get_response(self, response, match, request):
        """Turn a url_mapping value into a Response (see class docstring)."""
        if callable(response):
            response = response(match, request)

        if isinstance(response, Response):
            return response
        if isinstance(response, Exception):
            raise response
        if isinstance(response, int):
            body = b"ERROR" if 400 <= response < 600 else b"OK"
            return Response(body, request.url, status_code=response)
        if hasattr(response, "to_filename"):
            return Response(serialize_niimg(response), request.url)
        if hasattr(response, "read_bytes"):
            return Response(response.read_bytes(), request.url)
        if isinstance(response, str):
            # Expand regex backreferences when the key was a compiled
            # pattern, then encode to bytes.
            if isinstance(match, re.Match):
                response = match.expand(response)
            return Response(response.encode("utf-8"), request.url)
        if isinstance(response, bytes):
            return Response(response, request.url)
        raise TypeError(
            f"Don't know how to make a Response from: {response}"
        )

331 

332 

333def _get_format_and_pattern(file_path): 

334 file_path = Path(file_path) 

335 with file_path.open() as f: 

336 first_line = f.readline().strip() 

337 match = re.match(r"format *: *(.+)", first_line) 

338 if match is None: 

339 return "gztar", first_line, 1 

340 return match[1], f.readline().strip(), 2 

341 

342 

def _index_archive_contents():
    """Map url patterns to the archive_contents files that define them.

    Scans nilearn/datasets/tests/data/archive_contents recursively and
    returns a dict {url_pattern: absolute_file_path}. ``.py`` and ``.pyc``
    files are skipped. When several files declare the same pattern, the
    last one in sorted path order wins.
    """
    archive_contents_dir = (
        Path(__file__).parent.parent / "tests" / "data" / "archive_contents"
    )
    index = {}
    for file_path in sorted(archive_contents_dir.glob("**/*")):
        if file_path.is_file() and file_path.suffix not in (".py", ".pyc"):
            # Only the url pattern is needed for the index; the archive
            # format is re-read by _archive_from_contents_file when the
            # response is actually built.
            _, url_pattern, _ = _get_format_and_pattern(file_path)
            index[url_pattern] = str(file_path.resolve())
    return index

353 

354 

def _archive_from_contents_file(file_path):
    """Build an archive (as bytes) from an archive_contents file.

    ``file_path`` points to a file whose header gives the archive format
    and url pattern (see ``_get_format_and_pattern``) and whose remaining
    non-empty lines are the paths to create, empty, inside the archive.
    """
    file_path = Path(file_path)
    # The url pattern is not needed here: matching already happened in
    # Sender before this function was called.
    fmt, _, n_skip = _get_format_and_pattern(file_path)
    with file_path.open() as f:
        # Paths in the file always use '/'; convert to the OS separator.
        contents = [line.strip().replace("/", os.sep) for line in f]
    # Drop blank lines, then skip the header line(s).
    return list_to_archive(list(filter(bool, contents))[n_skip:], fmt)

361 

362 

363def _add_to_archive(path, content): 

364 path.parent.mkdir(exist_ok=True, parents=True) 

365 if hasattr(content, "to_filename"): 

366 content.to_filename(str(path)) 

367 elif hasattr(content, "is_dir") and hasattr(content, "is_file"): 

368 if content.is_file(): 

369 shutil.copy(str(content), str(path)) 

370 elif content.is_dir(): 

371 shutil.copytree(str(content), str(path)) 

372 else: 

373 raise FileNotFoundError( 

374 f"Not found or not a regular file or a directory {content}" 

375 ) 

376 elif isinstance(content, str): 

377 with path.open("w") as f: 

378 f.write(content) 

379 elif isinstance(content, bytes): 

380 with path.open("wb") as f: 

381 f.write(content) 

382 else: 

383 with path.open("wb") as f: 

384 pickle.dump(content, f) 

385 

386 

def dict_to_archive(data, archive_format="gztar"):
    """Build an archive, returned as bytes, from a {path: content} dict.

    Parameters
    ----------
    data : dict
        Maps paths inside the archive (strings using the system path
        separator, or `pathlib.Path` objects) to the file contents:
        - an object with a `to_filename` method (e.g. a Nifti1Image) is
          serialized to .nii.gz
        - a `pathlib.Path` (or anything with `is_file` and `is_dir`
          attributes, e.g. a `pathlib2.Path`) is copied into the archive
          (can point to a file or a directory)
        - a `str` or `bytes` becomes the contents of the file
        - anything else is pickled.

    archive_format : str, default="gztar"
        The archive format. See `shutil` documentation for available
        formats.

    Returns
    -------
    bytes : the contents of the resulting archive file, to be used for
        example as the contents of a mock response object (see Sender).

    Examples
    --------
    if `data` is `{"README.txt": "hello", Path("Data") / "labels.csv": "a,b"}`,
    the resulting archive has this structure:
    .
    ├── Data
    │   └── labels.csv
    └── README.txt

    where labels.csv and README.txt contain the corresponding values in
    `data`.

    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch = Path(scratch)
        # Stage the files in a subdirectory so the archive root contains
        # exactly the requested paths.
        staging_dir = scratch / "tmp"
        staging_dir.mkdir()
        for rel_path, file_content in data.items():
            _add_to_archive(staging_dir / rel_path, file_content)
        archive_path = shutil.make_archive(
            str(scratch / "archive"), archive_format, str(staging_dir)
        )
        return Path(archive_path).read_bytes()

435 

436 

def list_to_archive(sequence, archive_format="gztar", content=""):
    """Build an archive, as bytes, whose files all share the same content.

    Equivalent to calling `dict_to_archive` with every path in `sequence`
    mapped to `content` (the empty string by default).

    For example, if `sequence` is
    `["README.txt", Path("Data") / "labels.csv"]`,
    the resulting archive has this structure:
    .
    ├── Data
    │   └── labels.csv
    └── README.txt

    and "labels.csv" and "README.txt" contain the value of `content`.

    """
    return dict_to_archive(
        {archive_path: content for archive_path in sequence},
        archive_format=archive_format,
    )

457 

458 

def check_type_fetcher(data):
    """Recursively verify the types of objects returned by a fetcher.

    Only a fixed set of leaf types is allowed, plus (possibly nested)
    mappings and sequences of those. A TypeError is raised when a
    disallowed value is found inside a container; a bare disallowed value
    makes the function return False.

    When `data` is a Bunch/dict containing a "description" entry, the
    description must be a non-empty string.
    """
    allowed_leaf_types = (
        str,
        int,
        float,
        Nifti1Image,
        SurfaceImage,
        pd.DataFrame,
        PolyMesh,
        pathlib.Path,
    )
    if isinstance(data, allowed_leaf_types):
        return True
    if isinstance(data, (Bunch, dict)):
        for key, value in data.items():
            if key == "description":
                assert isinstance(value, str)
                assert value != ""
            if not check_type_fetcher(value):
                raise TypeError(f"Found {key} : {type(value)}")
        return True
    if isinstance(data, (set, list, tuple)):
        for value in data:
            if not check_type_fetcher(value):
                raise TypeError(f"{type(value)}")
        return True
    return False