sdmxabs.fetch

Obtain data from the ABS SDMX API.

  1"""Obtain data from the ABS SDMX API."""
  2
  3from typing import Unpack
  4from xml.etree.ElementTree import Element
  5
  6import numpy as np
  7import pandas as pd
  8
  9from sdmxabs.download_cache import GetFileKwargs
 10from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions, data_flows
 11from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml
 12
 13
 14def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
 15    """Extract observed data from the XML tree for a given single series."""
 16    series_elements = {}
 17    for item in xml_series.findall("gen:Obs", NAME_SPACES):
 18        # --- get the index and value from the XML item, or nan if not found
 19        index_container = item.find("gen:ObsDimension", NAME_SPACES)
 20        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
 21        value_container = item.find("gen:ObsValue", NAME_SPACES)
 22        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
 23        if index_obs is None or value_obs is None:
 24            continue
 25        series_elements[index_obs] = value_obs
 26    series: pd.Series = pd.Series(series_elements).sort_index()
 27
 28    # --- if we can, make the series values numeric
 29    series = series.replace("", np.nan)
 30    try:
 31        series = pd.to_numeric(series)
 32    except ValueError:
 33        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
 34        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
 35
 36    # --- if we can, make the index a PeriodIndex based on the frequency
 37    if "FREQ" in meta.index:
 38        freq = meta["FREQ"]
 39        if freq == "Annual":
 40            series.index = pd.PeriodIndex(series.index, freq="Y")
 41        elif freq == "Quarterly":
 42            series.index = pd.PeriodIndex(series.index, freq="Q")
 43        elif freq == "Monthly":
 44            series.index = pd.PeriodIndex(series.index, freq="M")
 45        elif freq in ("Daily", "Daily or businessweek"):
 46            series.index = pd.PeriodIndex(series.index, freq="D")
 47        else:
 48            print(f"Unknown frequency {freq}, leaving index as is.")
 49
 50    return series
 51
 52
 53def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
 54    """Decode a metadata value based on its ID and the relevant ABS codelist."""
 55    return_value = meta_value  # default to returning the raw value
 56    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
 57        cl_id = dims[meta_id]["id"]
 58        cl_package_type = dims[meta_id]["package"]
 59        if cl_id and cl_package_type == "codelist":
 60            cl = code_lists(cl_id)
 61            if meta_value in cl and "name" in cl[meta_value]:
 62                return_value = cl[meta_value]["name"]
 63    return return_value
 64
 65
 66def get_series_meta_data(
 67    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
 68) -> tuple[str, pd.Series]:
 69    """Extract and decode metadata from the XML tree for one given series.
 70
 71    Args:
 72        flow_id (str): The ID of the data flow to which the series belongs.
 73        xml_series (Element): The XML element representing the series.
 74        series_count (int): The index of the series in the XML tree.
 75        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 76            their associated codelist names.
 77
 78    Returns:
 79        tuple[str, pd.Series]: A tuple containing the series label and a Series
 80            of metadata items for the series.
 81
 82    """
 83    item_count = 0
 84    keys = [flow_id]
 85    flow_name = data_flows().get(flow_id, {"name": flow_id})["name"]
 86    meta_items = {"DATAFLOW": flow_name}  # start with the flow ID
 87    key_sets = ("SeriesKey", "Attributes")
 88    for key_set in key_sets:
 89        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 90        if attribs is None:
 91            print(f"No {key_set} found in series, skipping.")
 92            continue
 93        for item in attribs.findall("gen:Value", NAME_SPACES):
 94            # --- get the metadata item ID and value, or create a placeholder if missing
 95            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 96            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 97            keys.append(meta_value)
 98            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
 99            meta_items[meta_id] = decoded_meta_value
100            item_count += 1
101
102    final_key = ".".join(keys)  # create a unique label for the series
103
104    return final_key, pd.Series(meta_items).rename(final_key)
105
106
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract data and metadata from the XML tree.

    Args:
        flow_id (str): The ID of the data flow being extracted.
        tree (Element): Root of the SDMX XML response.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (data, meta) — one data column and
            one meta row per series, keyed by the series label.

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    meta: dict[str, pd.Series] = {}
    data: dict[str, pd.Series] = {}
    # NOTE: findall() never yields None, so the previous per-element
    # "if xml_series is None" guard was unreachable and has been removed.
    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
        label, dataset = get_series_meta_data(flow_id, xml_series, series_count, dims)
        if label in meta:
            # this can happen if you implicitly select the same series multiple times
            print(f"Duplicate series {label} in {flow_id} found, check your specifications, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
136
137
138# === public functions ===
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
            [Not implemented yet, will raise NotImplementedError if used.]
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        NotImplementedError: If constraints are provided, as they are not implemented yet.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- deal with the not implemented constraints
    if constraints is not None:
        raise NotImplementedError(
            "Constraints are not implemented yet. Please use the `dims` argument to select data items."
        )

    # --- prepare to get the XML root from the ABS SDMX API
    # setdefault is the idiomatic form of kwargs["k"] = kwargs.get("k", default)
    kwargs.setdefault("modality", "prefer-cache")
    key = build_key(flow_id, dims, validate=validate)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
192
193
# --- quick and dirty testing
if __name__ == "__main__":
    # Example usage: one quarterly WPI series for Australia
    FLOW_ID = "WPI"
    DIMS = {
        "MEASURE": "3",
        "INDEX": "OHRPEB",
        "SECTOR": "7",
        "INDUSTRY": "TOT",
        "TSEST": "10",
        "REGION": "AUS",
        "FREQ": "Q",
    }

    FETCHED_DATA, FETCHED_META = fetch(FLOW_ID, dims=DIMS, validate=True, modality="prefer-url")

    # Transposed (.T) so each series prints as a row, which is easier to read
    for title, frame in (("Fetched Data", FETCHED_DATA), ("Fetched Metadata", FETCHED_META)):
        print(f"\n{title}:\n", frame.T, sep="")
def get_series_data( xml_series: xml.etree.ElementTree.Element, meta: pandas.core.series.Series) -> pandas.core.series.Series:
15def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
16    """Extract observed data from the XML tree for a given single series."""
17    series_elements = {}
18    for item in xml_series.findall("gen:Obs", NAME_SPACES):
19        # --- get the index and value from the XML item, or nan if not found
20        index_container = item.find("gen:ObsDimension", NAME_SPACES)
21        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
22        value_container = item.find("gen:ObsValue", NAME_SPACES)
23        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
24        if index_obs is None or value_obs is None:
25            continue
26        series_elements[index_obs] = value_obs
27    series: pd.Series = pd.Series(series_elements).sort_index()
28
29    # --- if we can, make the series values numeric
30    series = series.replace("", np.nan)
31    try:
32        series = pd.to_numeric(series)
33    except ValueError:
34        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
35        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
36
37    # --- if we can, make the index a PeriodIndex based on the frequency
38    if "FREQ" in meta.index:
39        freq = meta["FREQ"]
40        if freq == "Annual":
41            series.index = pd.PeriodIndex(series.index, freq="Y")
42        elif freq == "Quarterly":
43            series.index = pd.PeriodIndex(series.index, freq="Q")
44        elif freq == "Monthly":
45            series.index = pd.PeriodIndex(series.index, freq="M")
46        elif freq in ("Daily", "Daily or businessweek"):
47            series.index = pd.PeriodIndex(series.index, freq="D")
48        else:
49            print(f"Unknown frequency {freq}, leaving index as is.")
50
51    return series

Extract observed data from the XML tree for a given single series.

def decode_meta_value(meta_value: str, meta_id: str, dims: dict[str, dict[str, str]]) -> str:
54def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
55    """Decode a metadata value based on its ID and the relevant ABS codelist."""
56    return_value = meta_value  # default to returning the raw value
57    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
58        cl_id = dims[meta_id]["id"]
59        cl_package_type = dims[meta_id]["package"]
60        if cl_id and cl_package_type == "codelist":
61            cl = code_lists(cl_id)
62            if meta_value in cl and "name" in cl[meta_value]:
63                return_value = cl[meta_value]["name"]
64    return return_value

Decode a metadata value based on its ID and the relevant ABS codelist.

def get_series_meta_data( flow_id: str, xml_series: xml.etree.ElementTree.Element, series_count: int, dims: dict[str, dict[str, str]]) -> tuple[str, pandas.core.series.Series]:
 67def get_series_meta_data(
 68    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
 69) -> tuple[str, pd.Series]:
 70    """Extract and decode metadata from the XML tree for one given series.
 71
 72    Args:
 73        flow_id (str): The ID of the data flow to which the series belongs.
 74        xml_series (Element): The XML element representing the series.
 75        series_count (int): The index of the series in the XML tree.
 76        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 77            their associated codelist names.
 78
 79    Returns:
 80        tuple[str, pd.Series]: A tuple containing the series label and a Series
 81            of metadata items for the series.
 82
 83    """
 84    item_count = 0
 85    keys = [flow_id]
 86    flow_name = data_flows().get(flow_id, {"name": flow_id})["name"]
 87    meta_items = {"DATAFLOW": flow_name}  # start with the flow ID
 88    key_sets = ("SeriesKey", "Attributes")
 89    for key_set in key_sets:
 90        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 91        if attribs is None:
 92            print(f"No {key_set} found in series, skipping.")
 93            continue
 94        for item in attribs.findall("gen:Value", NAME_SPACES):
 95            # --- get the metadata item ID and value, or create a placeholder if missing
 96            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 97            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 98            keys.append(meta_value)
 99            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
100            meta_items[meta_id] = decoded_meta_value
101            item_count += 1
102
103    final_key = ".".join(keys)  # create a unique label for the series
104
105    return final_key, pd.Series(meta_items).rename(final_key)

Extract and decode metadata from the XML tree for one given series.

Args: flow_id (str): The ID of the data flow to which the series belongs. xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.

Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.

def extract( flow_id: str, tree: xml.etree.ElementTree.Element) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
108def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
109    """Extract data from the XML tree."""
110    # Get the data dimensions for the flow_id, it provides entree to the metadata
111    dims = data_dimensions(flow_id)
112
113    meta = {}
114    data = {}
115    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
116        if xml_series is None:
117            print("No Series found in XML tree, skipping.")
118            continue
119        label, dataset = get_series_meta_data(
120            flow_id,
121            # python typing is not smart enough to know that
122            # xml_series is an ElementTree
123            xml_series,
124            series_count,
125            dims,
126        )
127        if label in meta:
128            # this can happen if you implicitly select the same series multiple times
129            print(f"Duplicate series {label} in {flow_id} found, check your specifications, skipping.")
130            continue
131        meta[label] = dataset
132        series = get_series_data(xml_series, dataset)
133        series.name = label
134        data[label] = series
135
136    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta

Extract data from the XML tree.

def fetch( flow_id: str, dims: dict[str, str] | None = None, constraints: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
140def fetch(
141    flow_id: str,
142    dims: dict[str, str] | None = None,
143    constraints: dict[str, str] | None = None,  # not implemented yet
144    *,
145    validate: bool = False,
146    **kwargs: Unpack[GetFileKwargs],
147) -> tuple[pd.DataFrame, pd.DataFrame]:
148    """Fetch data from the ABS SDMX API.
149
150    Args:
151        flow_id (str): The ID of the data flow from which to retrieve data items.
152        dims (dict[str, str], optional): A dictionary of dimensions to select the
153            data items. If None, the ABS fetch request will be for all data items,
154            which can be slow.
155        constraints (dict[str, str], optional): A dictionary of constraints to apply
156            to the data items. If None, no constraints are applied.
157            [Not implemented yet, will raise NotImplementedError if used.]
158        validate (bool): If True, print validation diagnostics for the proposed
159            dimensions against the metadata requirements. Defaults to False.
160        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
161
162    Returns: a tuple of two DataFrames:
163        - The first DataFrame contains the fetched data.
164        - The second DataFrame contains the metadata.
165
166    Raises:
167        HttpError: If there is an issue with the HTTP request.
168        CacheError: If there is an issue with the cache.
169        ValueError: If no XML root is found in the response.
170        NotImplementedError: If constraints are provided, as they are not implemented yet.
171
172    Notes:
173        If the `dims` argument is not valid you should get a CacheError or HttpError.
174        If the `flow_id` is not valid, you should get a ValueError.
175
176    """
177    # --- deal with the not implemented constraints
178    if constraints is not None:
179        raise NotImplementedError(
180            "Constraints are not implemented yet. Please use the `dims` argument to select data items."
181        )
182
183    # --- prepare to get the XML root from the ABS SDMX API
184    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
185    key = build_key(
186        flow_id,
187        dims,
188        validate=validate,
189    )
190    url = f"{URL_STEM}/data/{flow_id}/{key}"
191    xml_root = acquire_xml(url, **kwargs)
192    return extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. [Not implemented yet, will raise NotImplementedError if used.] validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. NotImplementedError: If constraints are provided, as they are not implemented yet.

Notes: If the dims argument is not valid you should get a CacheError or HttpError. If the flow_id is not valid, you should get a ValueError.