sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

 1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""
 2
 3from importlib.metadata import PackageNotFoundError, version
 4
 5from .download_cache import (
 6    CacheError,
 7    GetFileKwargs,
 8    HttpError,
 9    ModalityType,
10)
11from .fetch import fetch
12from .fetch_multi import fetch_multi
13from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
14from .flow_metadata import code_lists, data_dimensions, data_flows
15
16# --- version and author
17try:
18    __version__ = version(__name__)
19except PackageNotFoundError:
20    __version__ = "0.0.0"  # Fallback for development mode
21__author__ = "Bryan Palmer"
22
23# --- establish the package contents
24__all__ = [
25    "CacheError",
26    "GetFileKwargs",
27    "HttpError",
28    "MatchCriteria",
29    "MatchItem",
30    "MatchType",
31    "ModalityType",
32    "__author__",
33    "__version__",
34    "code_lists",
35    "data_dimensions",
36    "data_flows",
37    "fetch",
38    "fetch_multi",
39    "fetch_selection",
40    "make_wanted",
41    "match_item",
42]
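
As a quick orientation, here is a minimal usage sketch. The flow id "CPI" is an illustrative assumption, not a guaranteed identifier; list the real ids with data_flows() first.

```python
import sdmxabs as sa

# Discover the available ABS dataflows: {flow_id: {"name": ..., ...}}.
flows = sa.data_flows()
print(f"{len(flows)} dataflows available")

# Fetch one dataflow. "CPI" is an assumed example id -- verify it in `flows`.
# dims=None asks for every series in the flow, which can be slow.
data, meta = sa.fetch("CPI", dims=None)
```
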
class CacheError(builtins.Exception):
```python
class CacheError(Exception):
    """A problem retrieving data from the cache."""
```

class GetFileKwargs(typing.TypedDict):
```python
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url() function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer-cache" or "prefer-url"."""
```

class HttpError(builtins.Exception):
```python
class HttpError(Exception):
    """A problem retrieving data from HTTP."""
```

MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
MatchItem = tuple[str, str, MatchType]
class MatchType(enum.Enum):
```python
class MatchType(Enum):
    """Enumeration for match types."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3
```

ModalityType = typing.Literal['prefer-cache', 'prefer-url']
__author__ = 'Bryan Palmer'
__version__ = '0.1.0'
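
Because MatchItem and MatchCriteria are plain tuples and sequences of tuples, criteria can be written literally. The dimension ids below ("REGION", "FREQ") are assumptions for illustration:

```python
from sdmxabs import MatchCriteria, MatchType

criteria: MatchCriteria = [
    ("Australia", "REGION", MatchType.PARTIAL),  # substring match; "REGION" is an assumed dimension id
    ("^Q$", "FREQ", MatchType.REGEX),            # regex match; "FREQ" is an assumed dimension id
]
```
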
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
```python
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
            their associated key=value pairs. A "name" key should always
            be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue
        elements: dict[str, str] = {}
        name = code.find("com:Name", NAME_SPACES)
        elements["name"] = str(name.text) if name is not None else "(missing)"
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
            elements["parent"] = parent_id
        codes[code_id] = elements

    return codes
```
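
A hedged sketch of looking up a code list. "CL_FREQ" is an assumed codelist id; real ids can be read from the data_dimensions() output for a flow:

```python
from sdmxabs import code_lists

freq = code_lists("CL_FREQ")  # assumed codelist id
for code, attrs in freq.items():
    # "name" should always be present; "parent" only for hierarchical lists.
    print(code, attrs["name"], attrs.get("parent", ""))
```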

@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
```python
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
            their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    dimensions = {}
    for dim in tree.findall(".//str:Dimension", NAME_SPACES):
        dim_id = dim.get("id")
        dim_pos = dim.get("position")
        if dim_id is None or dim_pos is None:
            continue
        contents = {"position": dim_pos}
        if (lr := dim.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
            enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
        ) is not None:
            contents = contents | enumer.attrib
        dimensions[dim_id] = contents
    return dimensions
```
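
A sketch of inspecting a flow's dimensions; "CPI" is an assumed flow id. Dimensions backed by a codelist carry "package" == "codelist" and an "id" naming the list (these keys come from the Enumeration Ref attributes, as in the source above):

```python
from sdmxabs import code_lists, data_dimensions

dims = data_dimensions("CPI")  # assumed flow id
for dim_id, attrs in dims.items():
    print(dim_id, "at position", attrs["position"])
    if attrs.get("package") == "codelist":
        # Show the first few valid codes for this dimension.
        print("  codes:", list(code_lists(attrs["id"]))[:5])
```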

@cache
def data_flows(flow_id: str = 'all', **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
```python
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the top-level metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
            and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(no name)"
        attributes["name"] = str(df_name)
        d_flows[df_id] = attributes
    return d_flows
```
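
A sketch of browsing the catalogue, for example finding flows whose name mentions prices (the search term is arbitrary):

```python
from sdmxabs import data_flows

flows = data_flows()
# Every entry has a "name" key, so this lookup is safe.
matches = {fid: attrs["name"] for fid, attrs in flows.items() if "price" in attrs["name"].lower()}
print(matches)
```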

def fetch(flow_id: str, dims: dict[str, str] | None = None, constraints: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
```python
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
            [Not implemented yet, will raise NotImplementedError if used.]
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        NotImplementedError: If constraints are provided, as they are not implemented yet.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- deal with the not implemented constraints
    if constraints is not None:
        raise NotImplementedError(
            "Constraints are not implemented yet. Please use the `dims` argument to select data items."
        )

    # --- prepare to get the XML root from the ABS SDMX API
    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
```
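
A hedged sketch of a selective fetch. The flow id, dimension ids, and codes ("CPI", "MEASURE", "FREQ", "1", "Q") are assumptions, so run with validate=True against the real metadata first:

```python
from sdmxabs import fetch

data, meta = fetch(
    "CPI",                               # assumed flow id
    dims={"MEASURE": "1", "FREQ": "Q"},  # assumed dimension ids and codes
    validate=True,                       # print diagnostics against the metadata
    modality="prefer-cache",
)
print(data.shape, meta.shape)
```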

def fetch_multi(wanted: pandas.core.frame.DataFrame, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
```python
def fetch_multi(
    wanted: pd.DataFrame,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with rows for each desired data set (of one or more series).
                Each row should contain the necessary identifiers to fetch the dataset.
                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
                The 'flow_id' column is mandatory, and the rest are optional.
                Note: the DataFrame index is not used in the fetching process.
        validate: If True, the function will validate dimensions and values against
                  the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    Caution:
        The selected data should all have the same index. You cannot mix (for example)
        Quarterly and Monthly data in the same DataFrame.

    """
    # --- quick sanity checks
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return extract(wanted, validate=validate, **kwargs)
```
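
A sketch of a multi-dataset request built by hand; the flow ids and the dimension column below are assumptions, and in practice make_wanted() builds these rows for you:

```python
import pandas as pd
from sdmxabs import fetch_multi

# Each row selects one dataset; columns beyond 'flow_id' are assumed dimension ids.
# Both rows are quarterly, respecting the same-index caution above.
wanted = pd.DataFrame(
    [
        {"flow_id": "CPI", "FREQ": "Q"},  # illustrative flow id
        {"flow_id": "WPI", "FREQ": "Q"},  # illustrative flow id
    ]
).astype(str)
data, meta = fetch_multi(wanted, validate=True)
```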

def fetch_selection(flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
```python
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, validate=validate, **kwargs)
```
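
A sketch combining match_item() with fetch_selection(); the flow and dimension ids are assumed:

```python
from sdmxabs import MatchType, fetch_selection, match_item

criteria = [
    match_item("All groups", "INDEX", MatchType.PARTIAL),  # assumed dimension id
    match_item("Q", "FREQ", MatchType.EXACT),              # assumed dimension id
]
data, meta = fetch_selection("CPI", criteria, validate=True)  # assumed flow id
```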

def make_wanted(flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
```python
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the element name,
            the value to match, and the match type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
    -   Builds a one-row DataFrame. This frame may select multiple data series
        when passed to fetch_multi(). It can also be concatenated with other DataFrames
        to build a larger selection.
    -   If two match elements refer to the same dimension, only the `intersection` of the
        matches will be returned.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- let's build the codelist dictionary
    return_dict: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        dim_dict = dimensions[dimension]
        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
            continue
        code_list_name = dim_dict.get("id")
        codes = get_codes(code_lists(code_list_name), pattern, match_type)
        package_codes(codes, dimension, return_dict)

    # --- return as a (one row) `wanted` DataFrame
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict]).astype(str)
```
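
A sketch of building a larger selection by concatenating one-row frames from make_wanted(), as the notes above suggest; the flow and dimension ids are assumed:

```python
import pandas as pd
from sdmxabs import MatchType, fetch_multi, make_wanted, match_item

rows = [
    make_wanted("CPI", [match_item("All groups", "INDEX", MatchType.PARTIAL)]),  # assumed ids
    make_wanted("WPI", [match_item("Q", "FREQ", MatchType.EXACT)]),              # assumed ids
]
wanted = pd.concat(rows, ignore_index=True)
data, meta = fetch_multi(wanted)
```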

def match_item(pattern: str, dimension: str, match_type: MatchType = MatchType.PARTIAL) -> tuple[str, str, MatchType]:
```python
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match item.

    """
    return (pattern, dimension, match_type)
```
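
match_item() is sugar over the tuple form; the two lines below are equivalent (the dimension id is an assumption):

```python
from sdmxabs import MatchType, match_item

item = match_item("Australia", "REGION", MatchType.PARTIAL)  # assumed dimension id
assert item == ("Australia", "REGION", MatchType.PARTIAL)
```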
