Coverage for nilearn/datasets/tests/_testing.py: 34%
185 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-20 10:58 +0200
1"""Utilities for testing the dataset fetchers.
3Unit tests should not depend on an internet connection nor on external
4resources such as the servers from which we download datasets. Otherwise, tests
5can fail spuriously when a web service is unavailable, and tests are slow
6because downloading data takes a lot of time.
8Therefore in the tests, we fake the downloads: the function from the requests
9library that would normally download a file is replaced ("patched") by a
10"mock", a function that mimics its interface but doesn't download anything and
11returns fake data instead.
13As we only patch functions from urllib and requests, nilearn code is unaware of
14the mocking and can be tested as usual, as long as we provide fake responses
15that look similar to those we would obtain from dataset providers if we
16actually sent requests over the network.
18This module provides the utilities for setting up this mocking: patching the
19relevant requests and urllib functions, and creating the fake responses. The
20function from the requests library that nilearn uses to send requests is
21patched (replaced) by a `Sender` object defined in this module. The
22corresponding docstring details how individual tests can configure what fake
23responses it should return for specific URLs.
25To make sure tests don't rely on previously existing data and don't write
26outside of temporary directories, this module also adds fixtures to patch the
27home directory and other default nilearn data directories.
29"""
31import fnmatch
32import json
33import os
34import pathlib
35import pickle
36import re
37import shutil
38import tempfile
39from collections import OrderedDict
40from pathlib import Path
41from unittest.mock import MagicMock
43import pandas as pd
44import pytest
45from nibabel import Nifti1Image
46from requests.exceptions import HTTPError
47from sklearn.utils import Bunch
49from nilearn._utils.testing import serialize_niimg
50from nilearn.surface.surface import PolyMesh, SurfaceImage
@pytest.fixture(autouse=True)
def temp_nilearn_data_dir(tmp_path_factory, monkeypatch):
    """Redirect the home directory and nilearn data env variables.

    Guarantees that tests exercising nilearn.datasets neither read datasets
    already installed on the current machine nor write to the user's real
    home or nilearn data directories.

    Marked 'autouse' and imported in conftest.py so that it applies to every
    test, including those that do not request it explicitly.

    """
    fake_home = tmp_path_factory.mktemp("temp_nilearn_home")
    # Cover both POSIX (HOME) and Windows (USERPROFILE) conventions.
    for env_var in ("HOME", "USERPROFILE"):
        monkeypatch.setenv(env_var, str(fake_home))
    dataset_dir = fake_home / "nilearn_data"
    dataset_dir.mkdir()
    monkeypatch.setenv("NILEARN_DATA", str(dataset_dir))
    monkeypatch.setenv(
        "NILEARN_SHARED_DATA", str(fake_home / "nilearn_shared_data")
    )
@pytest.fixture(autouse=True)
def request_mocker(monkeypatch):
    """Replace the network entry points of requests and urllib with mocks.

    This keeps test functions off the network while still letting them run
    against mock data.

    requests.Session.send is patched with a Sender instance, whose responses
    tests can configure -- see the Sender docstring.

    urllib's OpenerDirector.open is patched with a plain MagicMock. Since the
    nilearn dataset fetchers rely on requests, most tests never touch it; it
    is patched so the network mocking cannot be bypassed by calling urllib
    directly, and to allow testing the FTP adapter.

    Marked 'autouse' and imported in conftest.py so that it applies to every
    test, including those that do not request it explicitly.

    """
    mock_sender = Sender()
    monkeypatch.setattr("requests.sessions.Session.send", mock_sender)
    monkeypatch.setattr("urllib.request.OpenerDirector.open", MagicMock())
    return mock_sender
class Response:
    """Mock substitute for requests.Response objects returned by Sender.

    Implements only the subset of the requests.Response interface that
    nilearn's fetchers actually use.

    """

    # Marker allowing tests to check that the patched object is in place.
    is_mock = True

    def __init__(self, content, url, status_code=200):
        self.content = content
        self.url = url
        self.status_code = status_code
        self.headers = {"Content-Length": len(self.content)}
        # Offset from which iter_content starts yielding; tests may move it
        # to simulate a resumed download.
        self.iter_start = 0

    def __enter__(self):
        return self

    def __exit__(self, *args):
        pass

    def iter_content(self, chunk_size=8):
        """Yield the content in chunks, starting at self.iter_start."""
        position = self.iter_start
        total = len(self.content)
        while position < total:
            yield self.content[position : position + chunk_size]
            position += chunk_size

    @property
    def text(self):
        """Content decoded as UTF-8."""
        return self.content.decode("utf-8")

    def json(self):
        """Content parsed as JSON."""
        return json.loads(self.text)

    def raise_for_status(self):
        """Raise HTTPError for 4xx/5xx status codes, mimicking requests."""
        if not 400 <= self.status_code < 600:
            return
        raise HTTPError(f"{self.status_code} Error for url: {self.url}")
class Request:
    """A mock request class.

    Stands in for the request objects that `requests` normally passes to
    `Session.send`; only the `url` attribute is used by `Sender`.
    """

    # Marker allowing tests to check that the patched object is in place.
    is_mock = True

    def __init__(self, url):
        # URL this fake request targets; Sender records it in sent_requests.
        self.url = url
class Sender:
    r"""Mock class used to patch requests.sessions.Session.send.

    In nilearn's tests this replaces the function used by requests to send
    requests over the network.

    Test functions can configure this object to specify what response is
    expected when sending requests to specific URLs. This is done by adding
    items to the ordered dictionary self.url_mapping

    When a Sender receives a request, it tries to match it against the keys in
    url_mapping, and then against urls found in
    nilearn/datasets/tests/data/archive_contents/ (details below). If a key
    matches, the corresponding value is used to compute the response. If no key
    matches, a response with empty content is returned.

    If several keys match, the one that was inserted most recently in
    url_mapping is used.

    Specifying keys
    ---------------
    Keys of url_mapping can be:

    - a `str`: it is used as a glob pattern, matched against the url with
      fnmatch (If special characters need to be matched literally they can be
      escaped with []). For example:
        '*' matches everything
        '*example.com/*' matches 'https://www.example.com/data'
          but not 'https://example.org'

    - a `re.Pattern` (ie a compiled regex): it is matched against the url, and
      groups can be used to capture parts needed to construct the response. For
      example:
        re.compile(r'.*example.org/subject_(\d+)\.tar\.gz')
          matches 'https://example.org/subject_12.tar.gz' and captures '12'
          but does not match 'https://example.org/subject_12.csv'

    If none of the keys in url_mapping matches, the Sender turns to the
    contents of nilearn/datasets/tests/data/archive_contents. Files in this
    directory or any subdirectory are used to build responses that contain zip
    or tar archives containing a certain list of files. (.py and .pyc files are
    ignored)
    The first line of a file in archive_contents is a glob pattern stating to
    which urls it applies. If it matches, the subsequent lines are paths that
    will exist inside the archive. The files created in the archive are
    empty. For example, if a file looks like:
        https://example.org/subj_*.tar.gz
        README.txt
        data/img.nii.gz
        data/labels.csv
    the response will be a tar gzipped archive with this structure:
        .
        ├── data
        │   ├── img.nii.gz
        │   └── labels.csv
        └── README.txt

    Moreover, if the first line starts with 'format:' it is used to determine
    the archive format. For example: 'format: zip', 'format: gztar' (see
    `shutil` documentation for available formats). In this case the second line
    contains the url pattern and the rest of the file lists the contents.
    The paths for the archive contents must use '/' as path separator, it gets
    converted to the OS's separator when the file is read.
    A helper script is provided in
    nilearn/datasets/tests/data/list_archive_contents.sh to generate such files
    from a url.

    Finally, if no key and no file matches the request url, a response with an
    empty content is returned.

    Specifying values
    -----------------
    Once a key matches, the corresponding value is used to build a response.
    The value can be:

    - a callable: it is called as value(match, request), where request is the
      input `Request` object, and match is the url if the key was a string and
      the `re.Match` resulting from matching the key if it was a `re.Pattern`.
      The result of this call is then processed as described below.
    - an instance of the Response class: it is used without modification.
    - a `bytes`: result is a Response with status 200 and these bytes as
      content.
    - a str: if the key was a `re.Pattern`, the value can contain
      backreferences that are replaced with groups matched in the url, e.g.
      \1, \g<groupname>. The resulting string is then encoded with UTF-8 to
      build the response content. For example:
        re.compile(r'.*example\.org/(.*)'): r'hello, \1'
        results in b'hello, nilearn' if the url is https://example.org/nilearn
    - an int: results in a response with this status code. The content is
      b"ERROR" if the status code is in [400, 600[ and b"OK" otherwise
    - an `Exception`: it is raised
    - a `pathlib.Path`: the contents of the response are the contents of that
      file. (can also be anything that has a `read_bytes` attribute,
      e.g. a `pathlib2.Path`)
    - an object with a `to_filename` method, eg a Nifti1Image: it is serialized
      to .nii.gz to produce the response content.

    To help construct values that mock downloaded archives, this module
    provides `dict_to_archive` and `list_to_archive` helper functions; more
    details in their docstrings.

    Inspecting history
    ------------------
    Senders record all sent requests in `sent_requests`, the visited urls in
    `visited_urls`, and the number of sent requests in `url_count`

    """

    is_mock = True

    def __init__(self):
        self.url_mapping = OrderedDict()
        self.sent_requests = []
        self._archive_contents_index = _index_archive_contents()

    @property
    def visited_urls(self):
        """Urls of all requests sent to this Sender, in order."""
        return [request.url for request in self.sent_requests]

    @property
    def url_count(self):
        """Number of requests sent to this Sender."""
        return len(self.visited_urls)

    def __call__(
        self,
        request,
        *args,  # noqa: ARG002
        **kwargs,  # noqa: ARG002
    ):
        """Record the request and return the configured mock response."""
        if isinstance(request, str):
            request = Request(request)
        self.sent_requests.append(request)
        # Most recently inserted keys take precedence, hence the reversal.
        for key, value in list(self.url_mapping.items())[::-1]:
            match = self.match(key, request.url)
            if match is not None:
                return self.get_response(value, match, request)
        # Fall back to the archive description files shipped with the tests.
        for key, file_path in self._archive_contents_index.items():
            match = self.match(key, request.url)
            if match is not None:
                return Response(
                    _archive_from_contents_file(file_path), request.url
                )
        return self.default_response(request)

    def default_response(self, request):
        """Response used when nothing matches: status 200, empty content."""
        return Response(b"", request.url)

    def match(self, key, url):
        """Match `url` against a url_mapping key.

        Returns a `re.Match` for `re.Pattern` keys, the url itself for
        matching `str` (glob) keys, and None when there is no match.
        """
        if isinstance(key, re.Pattern):
            return key.match(url)
        elif isinstance(key, str) and fnmatch.fnmatch(url, key):
            return url
        else:
            return None

    def get_response(self, response, match, request):
        """Build a Response from a url_mapping value (see class docstring)."""
        if callable(response):
            response = response(match, request)

        if isinstance(response, Response):
            return response
        elif isinstance(response, Exception):
            raise response
        elif isinstance(response, int):
            if 400 <= response < 600:
                return Response(b"ERROR", request.url, status_code=response)
            else:
                return Response(b"OK", request.url, status_code=response)
        elif hasattr(response, "to_filename"):
            return Response(serialize_niimg(response), request.url)
        elif hasattr(response, "read_bytes"):
            return Response(response.read_bytes(), request.url)
        elif isinstance(response, str):
            # Expand backreferences when the key was a compiled pattern.
            if isinstance(match, re.Match):
                response = match.expand(response)
            response = response.encode("utf-8")
            return Response(response, request.url)
        elif isinstance(response, bytes):
            return Response(response, request.url)
        else:
            raise TypeError(
                f"Don't know how to make a Response from: {response}"
            )
333def _get_format_and_pattern(file_path):
334 file_path = Path(file_path)
335 with file_path.open() as f:
336 first_line = f.readline().strip()
337 match = re.match(r"format *: *(.+)", first_line)
338 if match is None:
339 return "gztar", first_line, 1
340 return match[1], f.readline().strip(), 2
343def _index_archive_contents():
344 archive_contents_dir = (
345 Path(__file__).parent.parent / "tests" / "data" / "archive_contents"
346 )
347 index = {}
348 for file_path in sorted(archive_contents_dir.glob("**/*")):
349 if file_path.is_file() and file_path.suffix not in [".py", ".pyc"]:
350 fmt, url_pattern, n = _get_format_and_pattern(file_path)
351 index[url_pattern] = str(file_path.resolve())
352 return index
def _archive_from_contents_file(file_path):
    """Build an archive (as bytes) from a contents description file.

    The header line(s) give the archive format and url pattern; the remaining
    non-empty lines are the paths of (empty) files to create in the archive,
    with '/' converted to the OS path separator.
    """
    file_path = Path(file_path)
    archive_format, _, n_header = _get_format_and_pattern(file_path)
    with file_path.open() as stream:
        lines = [line.strip().replace("/", os.sep) for line in stream]
    non_empty = [line for line in lines if line]
    return list_to_archive(non_empty[n_header:], archive_format)
363def _add_to_archive(path, content):
364 path.parent.mkdir(exist_ok=True, parents=True)
365 if hasattr(content, "to_filename"):
366 content.to_filename(str(path))
367 elif hasattr(content, "is_dir") and hasattr(content, "is_file"):
368 if content.is_file():
369 shutil.copy(str(content), str(path))
370 elif content.is_dir():
371 shutil.copytree(str(content), str(path))
372 else:
373 raise FileNotFoundError(
374 f"Not found or not a regular file or a directory {content}"
375 )
376 elif isinstance(content, str):
377 with path.open("w") as f:
378 f.write(content)
379 elif isinstance(content, bytes):
380 with path.open("wb") as f:
381 f.write(content)
382 else:
383 with path.open("wb") as f:
384 pickle.dump(content, f)
def dict_to_archive(data, archive_format="gztar"):
    """Build an archive whose contents are described by a {path: content} dict.

    Parameters
    ----------
    data : dict
        Keys are strings or `pathlib.Path` objects giving paths inside the
        archive (strings must use the system path separator). Values give the
        contents of these files:
        - an object with a `to_filename` method (e.g. a Nifti1Image) is
          serialized to .nii.gz
        - a `pathlib.Path` (or anything with `is_file` and `is_dir`
          attributes, e.g. a `pathlib2.Path`) has its target file or
          directory copied into the archive
        - a `str` or `bytes` is written as-is
        - anything else is pickled.

    archive_format : str, default="gztar"
        The archive format. See `shutil` documentation for available formats.

    Returns
    -------
    bytes : the contents of the resulting archive file, to be used for example
        as the contents of a mock response object (see Sender).

    Examples
    --------
    if `data` is `{"README.txt": "hello", Path("Data") / "labels.csv": "a,b"}`,
    the resulting archive has this structure:
        .
        ├── Data
        │   └── labels.csv
        └── README.txt

    where labels.csv and README.txt contain the corresponding values in `data`

    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch = Path(scratch)
        # Stage the archive contents in a subdirectory so make_archive can
        # write the archive file next to it without including it.
        staging_dir = scratch / "tmp"
        staging_dir.mkdir()
        for relative_path, content in data.items():
            _add_to_archive(staging_dir / relative_path, content)
        archive_path = shutil.make_archive(
            str(scratch / "archive"), archive_format, str(staging_dir)
        )
        return Path(archive_path).read_bytes()
def list_to_archive(sequence, archive_format="gztar", content=""):
    """Build an archive containing the given paths, all with the same content.

    Delegates to dict_to_archive, using the items of `sequence` as paths and
    `content` (by default '') as the content of every file.

    For example, if `sequence` is
    `["README.txt", Path("Data") / "labels.csv"]`,
    the resulting archive has this structure:
        .
        ├── Data
        │   └── labels.csv
        └── README.txt

    and "labels.csv" and "README.txt" contain the value of `content`.

    """
    return dict_to_archive(
        {path: content for path in sequence}, archive_format=archive_format
    )
def check_type_fetcher(data):
    """Check type content of datasets.

    Recursively checks the content returned by fetchers to make sure it
    contains only some allowed types of objects. Containers (Bunch, dict,
    set, list, tuple) are traversed; a disallowed type nested in a container
    raises TypeError, while a disallowed type at the top level makes the
    function return False.

    If the data is a Bunch or dict containing a dataset description, also
    asserts that the description is a non-empty string.
    """
    allowed_leaf_types = (
        str,
        int,
        float,
        Nifti1Image,
        SurfaceImage,
        pd.DataFrame,
        PolyMesh,
        pathlib.Path,
    )
    if isinstance(data, allowed_leaf_types):
        return True
    if isinstance(data, (Bunch, dict)):
        for key, value in data.items():
            if key == "description":
                assert isinstance(value, str)
                assert value != ""
            if not check_type_fetcher(value):
                raise TypeError(f"Found {key} : {type(value)}")
        return True
    if isinstance(data, (set, list, tuple)):
        for value in data:
            if not check_type_fetcher(value):
                raise TypeError(f"{type(value)}")
        return True
    return False
495 return True