sdmxabs.fetch
Obtain data from the ABS SDMX API.
1"""Obtain data from the ABS SDMX API.""" 2 3from typing import Unpack 4from xml.etree.ElementTree import Element 5 6import numpy as np 7import pandas as pd 8 9from sdmxabs.download_cache import GetFileKwargs 10from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions, data_flows 11from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml 12 13 14def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series: 15 """Extract observed data from the XML tree for a given single series.""" 16 series_elements = {} 17 for item in xml_series.findall("gen:Obs", NAME_SPACES): 18 # --- get the index and value from the XML item, or nan if not found 19 index_container = item.find("gen:ObsDimension", NAME_SPACES) 20 index_obs = index_container.attrib.get("value", None) if index_container is not None else None 21 value_container = item.find("gen:ObsValue", NAME_SPACES) 22 value_obs = value_container.attrib.get("value", None) if value_container is not None else None 23 if index_obs is None or value_obs is None: 24 continue 25 series_elements[index_obs] = value_obs 26 series: pd.Series = pd.Series(series_elements).sort_index() 27 28 # --- if we can, make the series values numeric 29 series = series.replace("", np.nan) 30 try: 31 series = pd.to_numeric(series) 32 except ValueError: 33 # If conversion fails, keep the series as is (it may contain useful non-numeric data) 34 print(f"Could not convert series {meta.name} to numeric, keeping as is.") 35 36 # --- if we can, make the index a PeriodIndex based on the frequency 37 if "FREQ" in meta.index: 38 freq = meta["FREQ"] 39 if freq == "Annual": 40 series.index = pd.PeriodIndex(series.index, freq="Y") 41 elif freq == "Quarterly": 42 series.index = pd.PeriodIndex(series.index, freq="Q") 43 elif freq == "Monthly": 44 series.index = pd.PeriodIndex(series.index, freq="M") 45 elif freq in ("Daily", "Daily or businessweek"): 46 series.index = pd.PeriodIndex(series.index, freq="D") 47 else: 48 
print(f"Unknown frequency {freq}, leaving index as is.") 49 50 return series 51 52 53def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str: 54 """Decode a metadata value based on its ID and the relevant ABS codelist.""" 55 return_value = meta_value # default to returning the raw value 56 if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]: 57 cl_id = dims[meta_id]["id"] 58 cl_package_type = dims[meta_id]["package"] 59 if cl_id and cl_package_type == "codelist": 60 cl = code_lists(cl_id) 61 if meta_value in cl and "name" in cl[meta_value]: 62 return_value = cl[meta_value]["name"] 63 return return_value 64 65 66def get_series_meta_data( 67 flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict 68) -> tuple[str, pd.Series]: 69 """Extract and decode metadata from the XML tree for one given series. 70 71 Args: 72 flow_id (str): The ID of the data flow to which the series belongs. 73 xml_series (Element): The XML element representing the series. 74 series_count (int): The index of the series in the XML tree. 75 dims (FlowMetaDict): Dictionary containing metadata dimensions and 76 their associated codelist names. 77 78 Returns: 79 tuple[str, pd.Series]: A tuple containing the series label and a Series 80 of metadata items for the series. 
81 82 """ 83 item_count = 0 84 keys = [flow_id] 85 flow_name = data_flows().get(flow_id, {"name": flow_id})["name"] 86 meta_items = {"DATAFLOW": flow_name} # start with the flow ID 87 key_sets = ("SeriesKey", "Attributes") 88 for key_set in key_sets: 89 attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES) 90 if attribs is None: 91 print(f"No {key_set} found in series, skipping.") 92 continue 93 for item in attribs.findall("gen:Value", NAME_SPACES): 94 # --- get the metadata item ID and value, or create a placeholder if missing 95 meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}") 96 meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}") 97 keys.append(meta_value) 98 decoded_meta_value = decode_meta_value(meta_value, meta_id, dims) 99 meta_items[meta_id] = decoded_meta_value 100 item_count += 1 101 102 final_key = ".".join(keys) # create a unique label for the series 103 104 return final_key, pd.Series(meta_items).rename(final_key) 105 106 107def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]: 108 """Extract data from the XML tree.""" 109 # Get the data dimensions for the flow_id, it provides entree to the metadata 110 dims = data_dimensions(flow_id) 111 112 meta = {} 113 data = {} 114 for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)): 115 if xml_series is None: 116 print("No Series found in XML tree, skipping.") 117 continue 118 label, dataset = get_series_meta_data( 119 flow_id, 120 # python typing is not smart enough to know that 121 # xml_series is an ElementTree 122 xml_series, 123 series_count, 124 dims, 125 ) 126 if label in meta: 127 # this can happen if you implicitly select the same series multiple times 128 print(f"Duplicate series {label} in {flow_id} found, check your specifications, skipping.") 129 continue 130 meta[label] = dataset 131 series = get_series_data(xml_series, dataset) 132 series.name = label 133 data[label] = 
series 134 135 return pd.DataFrame(data), pd.DataFrame(meta).T # data, meta 136 137 138# === public functions === 139def fetch( 140 flow_id: str, 141 dims: dict[str, str] | None = None, 142 constraints: dict[str, str] | None = None, # not implemented yet 143 *, 144 validate: bool = False, 145 **kwargs: Unpack[GetFileKwargs], 146) -> tuple[pd.DataFrame, pd.DataFrame]: 147 """Fetch data from the ABS SDMX API. 148 149 Args: 150 flow_id (str): The ID of the data flow from which to retrieve data items. 151 dims (dict[str, str], optional): A dictionary of dimensions to select the 152 data items. If None, the ABS fetch request will be for all data items, 153 which can be slow. 154 constraints (dict[str, str], optional): A dictionary of constraints to apply 155 to the data items. If None, no constraints are applied. 156 [Not implemented yet, will raise NotImplementedError if used.] 157 validate (bool): If True, print validation diagnostics for the proposed 158 dimensions against the metadata requirements. Defaults to False. 159 **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml(). 160 161 Returns: a tuple of two DataFrames: 162 - The first DataFrame contains the fetched data. 163 - The second DataFrame contains the metadata. 164 165 Raises: 166 HttpError: If there is an issue with the HTTP request. 167 CacheError: If there is an issue with the cache. 168 ValueError: If no XML root is found in the response. 169 NotImplementedError: If constraints are provided, as they are not implemented yet. 170 171 Notes: 172 If the `dims` argument is not valid you should get a CacheError or HttpError. 173 If the `flow_id` is not valid, you should get a ValueError. 174 175 """ 176 # --- deal with the not implemented constraints 177 if constraints is not None: 178 raise NotImplementedError( 179 "Constraints are not implemented yet. Please use the `dims` argument to select data items." 
180 ) 181 182 # --- prepare to get the XML root from the ABS SDMX API 183 kwargs["modality"] = kwargs.get("modality", "prefer-cache") 184 key = build_key( 185 flow_id, 186 dims, 187 validate=validate, 188 ) 189 url = f"{URL_STEM}/data/{flow_id}/{key}" 190 xml_root = acquire_xml(url, **kwargs) 191 return extract(flow_id, xml_root) 192 193 194# --- quick and dirty testing 195if __name__ == "__main__": 196 # Example usage 197 FLOW_ID = "WPI" 198 DIMS = { 199 "MEASURE": "3", 200 "INDEX": "OHRPEB", 201 "SECTOR": "7", 202 "INDUSTRY": "TOT", 203 "TSEST": "10", 204 "REGION": "AUS", 205 "FREQ": "Q", 206 } 207 208 FETCHED_DATA, FETCHED_META = fetch( 209 FLOW_ID, 210 dims=DIMS, 211 validate=True, 212 modality="prefer-url", 213 ) 214 # Note: The transpose (.T) is used here to make the output more readable 215 print("\nFetched Data:\n", FETCHED_DATA.T, sep="") 216 print("\nFetched Metadata:\n", FETCHED_META.T, sep="")
def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
    """Extract observed data from the XML tree for a given single series.

    Args:
        xml_series (Element): The XML element representing one series.
        meta (pd.Series): Decoded metadata for the series; its "FREQ" entry
            (if present) drives conversion of the index to a PeriodIndex.

    Returns:
        pd.Series: The observations, numeric where possible, with a
        PeriodIndex where the frequency is recognised.

    """
    series_elements = {}
    for item in xml_series.findall("gen:Obs", NAME_SPACES):
        # --- get the index and value from the XML item; skip the Obs if either is missing
        index_container = item.find("gen:ObsDimension", NAME_SPACES)
        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
        value_container = item.find("gen:ObsValue", NAME_SPACES)
        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
        if index_obs is None or value_obs is None:
            continue
        series_elements[index_obs] = value_obs
    series: pd.Series = pd.Series(series_elements).sort_index()

    # --- if we can, make the series values numeric
    series = series.replace("", np.nan)
    try:
        series = pd.to_numeric(series)
    except ValueError:
        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
        print(f"Could not convert series {meta.name} to numeric, keeping as is.")

    # --- if we can, make the index a PeriodIndex based on the frequency
    # (mapping table replaces the repetitive if/elif chain; same behavior)
    freq_codes = {
        "Annual": "Y",
        "Quarterly": "Q",
        "Monthly": "M",
        "Daily": "D",
        "Daily or businessweek": "D",
    }
    if "FREQ" in meta.index:
        freq = meta["FREQ"]
        if freq in freq_codes:
            series.index = pd.PeriodIndex(series.index, freq=freq_codes[freq])
        else:
            print(f"Unknown frequency {freq}, leaving index as is.")

    return series
Extract observed data from the XML tree for a given single series.
def decode_meta_value(meta_value: str, meta_id: str, dims: "FlowMetaDict") -> str:
    """Decode a metadata value based on its ID and the relevant ABS codelist.

    Args:
        meta_value (str): The raw metadata value (usually a code) to decode.
        meta_id (str): The ID of the metadata dimension the value belongs to.
        dims (FlowMetaDict): Dictionary of dimensions and their associated
            codelist details.

    Returns:
        str: The human-readable name from the codelist when one is found,
        otherwise the raw value unchanged.

    """
    dim = dims.get(meta_id, {})
    cl_id = dim.get("id", "")
    cl_package_type = dim.get("package", "")
    if cl_id and cl_package_type == "codelist":
        # only consult the ABS codelist when the dimension points at one
        cl = code_lists(cl_id)
        if meta_value in cl and "name" in cl[meta_value]:
            return cl[meta_value]["name"]
    return meta_value  # fall back to the raw value
Decode a metadata value based on its ID and the relevant ABS codelist.
def get_series_meta_data(
    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
) -> tuple[str, pd.Series]:
    """Extract and decode metadata from the XML tree for one given series.

    Args:
        flow_id (str): The ID of the data flow to which the series belongs.
        xml_series (Element): The XML element representing the series.
        series_count (int): The index of the series in the XML tree.
        dims (FlowMetaDict): Dictionary containing metadata dimensions and
            their associated codelist names.

    Returns:
        tuple[str, pd.Series]: A tuple containing the series label and a Series
            of metadata items for the series.

    """
    flow_name = data_flows().get(flow_id, {"name": flow_id})["name"]
    meta_items = {"DATAFLOW": flow_name}  # seed the metadata with the flow name
    keys = [flow_id]
    item_count = 0
    for key_set in ("SeriesKey", "Attributes"):
        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
        if attribs is None:
            print(f"No {key_set} found in series, skipping.")
            continue
        for item in attribs.findall("gen:Value", NAME_SPACES):
            # --- use a unique placeholder when the id/value attribute is absent
            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
            keys.append(meta_value)
            meta_items[meta_id] = decode_meta_value(meta_value, meta_id, dims)
            item_count += 1

    label = ".".join(keys)  # a unique label for the series
    return label, pd.Series(meta_items).rename(label)
Extract and decode metadata from the XML tree for one given series.
Args: flow_id (str): The ID of the data flow to which the series belongs. xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.
Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract data and metadata for every series in the XML tree.

    Args:
        flow_id (str): The ID of the data flow the tree was fetched from.
        tree (Element): The XML root returned by the ABS SDMX data endpoint.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (data, meta) — one data column
        and one metadata row per series, keyed by a unique series label.

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    meta: dict[str, pd.Series] = {}
    data: dict[str, pd.Series] = {}
    # Note: findall() never yields None, so the old per-item None check was
    # unreachable and has been removed.
    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
        label, dataset = get_series_meta_data(flow_id, xml_series, series_count, dims)
        if label in meta:
            # this can happen if you implicitly select the same series multiple times
            print(f"Duplicate series {label} in {flow_id} found, check your specifications, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
Extract data from the XML tree.
140def fetch( 141 flow_id: str, 142 dims: dict[str, str] | None = None, 143 constraints: dict[str, str] | None = None, # not implemented yet 144 *, 145 validate: bool = False, 146 **kwargs: Unpack[GetFileKwargs], 147) -> tuple[pd.DataFrame, pd.DataFrame]: 148 """Fetch data from the ABS SDMX API. 149 150 Args: 151 flow_id (str): The ID of the data flow from which to retrieve data items. 152 dims (dict[str, str], optional): A dictionary of dimensions to select the 153 data items. If None, the ABS fetch request will be for all data items, 154 which can be slow. 155 constraints (dict[str, str], optional): A dictionary of constraints to apply 156 to the data items. If None, no constraints are applied. 157 [Not implemented yet, will raise NotImplementedError if used.] 158 validate (bool): If True, print validation diagnostics for the proposed 159 dimensions against the metadata requirements. Defaults to False. 160 **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml(). 161 162 Returns: a tuple of two DataFrames: 163 - The first DataFrame contains the fetched data. 164 - The second DataFrame contains the metadata. 165 166 Raises: 167 HttpError: If there is an issue with the HTTP request. 168 CacheError: If there is an issue with the cache. 169 ValueError: If no XML root is found in the response. 170 NotImplementedError: If constraints are provided, as they are not implemented yet. 171 172 Notes: 173 If the `dims` argument is not valid you should get a CacheError or HttpError. 174 If the `flow_id` is not valid, you should get a ValueError. 175 176 """ 177 # --- deal with the not implemented constraints 178 if constraints is not None: 179 raise NotImplementedError( 180 "Constraints are not implemented yet. Please use the `dims` argument to select data items." 
181 ) 182 183 # --- prepare to get the XML root from the ABS SDMX API 184 kwargs["modality"] = kwargs.get("modality", "prefer-cache") 185 key = build_key( 186 flow_id, 187 dims, 188 validate=validate, 189 ) 190 url = f"{URL_STEM}/data/{flow_id}/{key}" 191 xml_root = acquire_xml(url, **kwargs) 192 return extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. [Not implemented yet, will raise NotImplementedError if used.] validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. NotImplementedError: If constraints are provided, as they are not implemented yet.
Notes:
If the `dims` argument is not valid you should get a CacheError or HttpError.
If the `flow_id` is not valid, you should get a ValueError.