Module gamslib.sip.utils

Utility functions for the GAMS SIP package creation and validation.

Provides helpers for validating object directories, extracting IDs, calculating hashes, counting files and bytes, and fetching JSON schemas for validation.

Features

  • Validates object directory structure and required files.
  • Extracts and validates object and datastream IDs.
  • Calculates MD5 and SHA512 hashes for files.
  • Counts files and bytes in a directory tree.
  • Fetches and parses JSON schemas from URLs, with error handling.

Usage

Use validate_object_dir(object_path) to check an object directory. Use extract_id(path) to extract and validate an object or datastream ID. Use md5hash(file), sha512hash(file), or sha256hash(file) for file checksums. Use count_bytes(root_dir) or count_files(root_dir) for directory statistics. Use fetch_json_schema(url) to retrieve a JSON schema from a remote URL.

Functions

def count_bytes(root_dir: pathlib._local.Path) ‑> int
Expand source code
def count_bytes(root_dir: Path) -> int:
    """
    Sum the sizes of all regular files found anywhere beneath root_dir.

    Args:
        root_dir (Path): Directory whose tree is scanned recursively.

    Returns:
        int: Combined size in bytes of every file found (0 if there are none).
    """
    # rglob("*") also yields directories; only regular files contribute.
    return sum(
        entry.stat().st_size
        for entry in root_dir.rglob("*")
        if entry.is_file()
    )

Count the number of bytes of all files below root_dir.

Args

root_dir : Path
Directory to count bytes in.

Returns

int
Total number of bytes in all files.
def count_files(root_dir: pathlib._local.Path) ‑> int
Expand source code
def count_files(root_dir: Path) -> int:
    """
    Count the regular files found anywhere beneath root_dir.

    Args:
        root_dir (Path): Directory whose tree is scanned recursively.

    Returns:
        int: Number of files found; directories themselves are not counted.
    """
    return sum(1 for entry in root_dir.rglob("*") if entry.is_file())

Count the number of all files below root_dir.

Args

root_dir : Path
Directory to count files in.

Returns

int
Total number of files.
def fetch_json_schema(url: str) ‑> dict
Expand source code
@lru_cache()
def fetch_json_schema(url: str) -> dict:
    """
    Return the JSON schema identified by a URL.

    Results are memoized per URL via ``lru_cache``, so each distinct URL is
    resolved at most once while it stays in the cache.

    Args:
        url (str): Location of the JSON schema.

    Returns:
        dict: The decoded schema document.

    Raises:
        BagValidationError: If the request fails, the server answers with a
            non-OK status, or the response body cannot be decoded as JSON.
    """
    # The GAMS SIP schema is read from the package instead of being downloaded.
    if url == GAMS_SIP_SCHEMA_URL:
        logger.debug("Using embedded GAMS SIP schema")
        return read_sip_schema_from_package()

    try:
        logger.debug("Fetching JSON schema from %s", url)
        response = requests.get(url, timeout=20)
        if not response.ok:
            http_failure = (
                f"Failed to fetch JSON schema from '{url}': "
                f"HTTP status code {response.status_code}"
            )
            raise BagValidationError(http_failure)
    except requests.RequestException as exc:
        # Connection problems, timeouts etc. are reported as validation errors.
        request_failure = f"Failed to fetch JSON schema from '{url}': {exc}"
        raise BagValidationError(request_failure) from exc

    try:
        schema = response.json()
    except (
        requests.JSONDecodeError,
        requests.exceptions.InvalidJSONError,
        TypeError,
    ) as exc:
        decode_failure = f"Schema referenced in 'sip.json' is not valid JSON: {exc}"
        raise BagValidationError(decode_failure) from exc
    return schema

Fetch a JSON schema from a URL.

Args

url : str
URL to fetch the JSON schema from.

Returns

dict
Parsed JSON schema.

Raises

BagValidationError
If the schema cannot be fetched or is not valid JSON.
def find_object_folders(root_folder: pathlib._local.Path) ‑> Generator[pathlib._local.Path, None, None]
Expand source code
def find_object_folders(root_folder: Path) -> Generator[Path, None, None]:
    """
    Walk root_folder recursively and yield every folder that holds a DC.xml.

    Args:
        root_folder (Path): Directory at which the search starts.

    Yields:
        Path: Each visited folder whose file list contains "DC.xml".

    Notes:
        - Folders without a DC.xml file are skipped. A warning is logged for
          each of them unless the folder contains a project.toml file.
    """
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        if "DC.xml" in filenames:
            yield Path(current_dir)
            continue
        # An empty file list trivially lacks project.toml, so a single
        # membership test covers both "no files" and "no project.toml".
        if "project.toml" not in filenames:
            logger.warning(
                "Skipping folder %s as it does not contain a DC.xml file.",
                current_dir,
            )

Find all object folders in the root folder or below.

Args

root_folder : Path
Root directory to search for object folders.

Yields

Path
Path to each object folder containing a DC.xml file.

Notes

  • Skips folders that do not contain a DC.xml file; a warning is logged for each skipped folder unless it contains a project.toml file.
def md5hash(file: pathlib._local.Path) ‑> str
Expand source code
def md5hash(file: Path) -> str:
    """
    Calculate the MD5 hash of a file.

    The file is read in fixed-size chunks so that memory use stays bounded
    regardless of the file size.

    Args:
        file (Path): Path to the file.

    Returns:
        str: MD5 hash as a hexadecimal string.
    """
    digest = hashlib.md5()
    with file.open("rb") as stream:
        # read() returns b"" at end of file, which ends the iteration.
        for chunk in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()

Calculate the MD5 hash of a file.

Args

file : Path
Path to the file.

Returns

str
MD5 hash as a hexadecimal string.
def read_sip_schema_from_package()
Expand source code
def read_sip_schema_from_package() -> dict:
    """
    Read the SIP JSON schema from the package data.

    The schema file is located in the sip subpackage under the resources directory.

    Returns:
        dict: Parsed JSON schema.
    """
    # Decode explicitly as UTF-8: without an encoding argument the platform's
    # locale encoding is used, which can mis-decode non-ASCII schema text.
    with SCHEMA_PATH.open(encoding="utf-8") as f:
        return json.load(f)

Read the SIP JSON schema from the package data.

The schema file is located in the sip subpackage under the resources directory.

Returns

dict
Parsed JSON schema.
def sha512hash(file: pathlib._local.Path) ‑> str
Expand source code
def sha512hash(file: Path) -> str:
    """
    Calculate the SHA512 hash of a file.

    The file is read in fixed-size chunks so that memory use stays bounded
    regardless of the file size.

    Args:
        file (Path): Path to the file.

    Returns:
        str: SHA512 hash as a hexadecimal string.
    """
    digest = hashlib.sha512()
    with file.open("rb") as stream:
        # read() returns b"" at end of file, which ends the iteration.
        for chunk in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()

Calculate the SHA512 hash of a file.

Args

file : Path
Path to the file.

Returns

str
SHA512 hash as a hexadecimal string.
def validate_object_dir(object_path: pathlib._local.Path) ‑> None
Expand source code
def validate_object_dir(object_path: Path) -> None:
    """
    Check if everything needed is present in the object directory.

    The checks run in order: the path must be a directory, it must contain a
    DC.xml file, it must contain an object.csv file, and finally the
    object.csv contents are validated via ObjectCSVManager.

    Args:
        object_path (Path): Path to the object directory.

    Raises:
        ObjectDirectoryValidationError: If the directory or required files are missing,
            or if object.csv is invalid.
    """
    if not object_path.is_dir():
        raise ObjectDirectoryValidationError(
            f"Object directory '{object_path}' does not exist or is not a directory."
        )

    if not (object_path / "DC.xml").exists():
        # The message previously read "does contain", stating the opposite
        # of the actual problem.
        raise ObjectDirectoryValidationError(
            f"Object directory '{object_path}' does not contain a DC.xml file."
        )

    # TODO: validate the DC.xml file? Do we require some fields?

    # Check the object.csv file
    objfile = object_path / "object.csv"
    if not objfile.exists():
        raise ObjectDirectoryValidationError(
            f"Object directory '{object_path}' does not contain an object.csv file."
        )
    # use the ObjectCSVManager class to validate contents of the object.csv file
    csv_mgr = ObjectCSVManager(object_path)
    csv_mgr.validate()

Check if everything needed is present in the object directory.

Args

object_path : Path
Path to the object directory.

Raises

ObjectDirectoryValidationError
If the directory or required files are missing, or if object.csv is invalid.