sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.""" 2 3from importlib.metadata import PackageNotFoundError, version 4 5from .download_cache import ( 6 CacheError, 7 GetFileKwargs, 8 HttpError, 9 ModalityType, 10) 11from .fetch import fetch 12from .fetch_multi import fetch_multi 13from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item 14from .flow_metadata import code_lists, data_dimensions, data_flows 15 16# --- version and author 17try: 18 __version__ = version(__name__) 19except PackageNotFoundError: 20 __version__ = "0.0.0" # Fallback for development mode 21__author__ = "Bryan Palmer" 22 23# --- establish the package contents 24__all__ = [ 25 "CacheError", 26 "GetFileKwargs", 27 "HttpError", 28 "MatchCriteria", 29 "MatchItem", 30 "MatchType", 31 "ModalityType", 32 "__author__", 33 "__version__", 34 "code_lists", 35 "data_dimensions", 36 "data_flows", 37 "fetch", 38 "fetch_multi", 39 "fetch_selection", 40 "make_wanted", 41 "match_item", 42]
CacheError: A problem retrieving data from the cache.
```python
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
```
TypedDict for acquire_url function arguments.
HttpError: A problem retrieving data from HTTP.
```python
class MatchType(Enum):
    """Enumeration for match types."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3
```
Enumeration for match types.
```python
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue
        elements: dict[str, str] = {}
        name = code.find("com:Name", NAME_SPACES)
        elements["name"] = str(name.text) if name is not None else "(missing)"
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        elements["parent"] = parent_id
        codes[code_id] = elements

    return codes
```
Get the code list metadata from the ABS SDMX API.
Args:
- cl_id (str): The ID of the code list to retrieve.
- **kwargs: Additional keyword arguments passed to acquire_url().

Returns:
- FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.

Raises:
- HttpError: If there is an issue with the HTTP request.
- CacheError: If there is an issue with the cache.
- ValueError: If no XML root is found in the response.

Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
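A hedged usage sketch; the codelist ID "CL_FREQ" is an assumption for illustration, and real codelist IDs come from the "id" entries returned by data_dimensions():

```python
from sdmxabs import code_lists

# "CL_FREQ" is an assumed codelist ID for illustration.
codes = code_lists("CL_FREQ")
for code_id, attrs in codes.items():
    # Every entry should carry a "name"; "parent" may be empty.
    print(code_id, attrs["name"], attrs.get("parent", ""))
```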
```python
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    dimensions = {}
    for dim in tree.findall(".//str:Dimension", NAME_SPACES):
        dim_id = dim.get("id")
        dim_pos = dim.get("position")
        if dim_id is None or dim_pos is None:
            continue
        contents = {"position": dim_pos}
        if (lr := dim.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
            enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
        ) is not None:
            contents = contents | enumer.attrib
        dimensions[dim_id] = contents
    return dimensions
```
Get the data dimensions metadata from the ABS SDMX API.
Args:
- flow_id (str): The ID of the dataflow to retrieve dimensions for.
- **kwargs: Additional keyword arguments passed to acquire_url().

Returns:
- dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.

Raises:
- HttpError: If there is an issue with the HTTP request.
- CacheError: If there is an issue with the cache.
- ValueError: If no XML root is found in the response.
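A short sketch; the flow ID "CPI" is an assumption for illustration. Where a dimension is enumerated, its "id" attribute names the codelist to pass to code_lists():

```python
from sdmxabs import data_dimensions

# "CPI" is an assumed flow ID for illustration.
dims = data_dimensions("CPI")
for dim_id, attrs in dims.items():
    print(attrs["position"], dim_id, attrs.get("id", "(no codelist)"))
```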
```python
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the top-level metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
        and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(no name)"
        attributes["name"] = str(df_name)
        d_flows[df_id] = attributes
    return d_flows
```
Get the top-level metadata from the ABS SDMX API.
Args:
- flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
- **kwargs: Additional keyword arguments passed to acquire_url().

Returns:
- dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.

Raises:
- HttpError: If there is an issue with the HTTP request.
- CacheError: If there is an issue with the cache.
- ValueError: If no XML root is found in the response.
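A quick sketch of browsing the catalogue with the defaults:

```python
from sdmxabs import data_flows

flows = data_flows()  # defaults to flow_id="all"
for flow_id, attrs in list(flows.items())[:5]:
    print(flow_id, "->", attrs["name"])
```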
```python
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
            [Not implemented yet, will raise NotImplementedError if used.]
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        NotImplementedError: If constraints are provided, as they are not implemented yet.

    Notes:
        If the `dims` argument is not valid, you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- deal with the not implemented constraints
    if constraints is not None:
        raise NotImplementedError(
            "Constraints are not implemented yet. Please use the `dims` argument to select data items."
        )

    # --- prepare to get the XML root from the ABS SDMX API
    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
```
Fetch data from the ABS SDMX API.
Args:
- flow_id (str): The ID of the data flow from which to retrieve data items.
- dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow.
- constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. [Not implemented yet, will raise NotImplementedError if used.]
- validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False.
- **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames:
- The first DataFrame contains the fetched data.
- The second DataFrame contains the metadata.

Raises:
- HttpError: If there is an issue with the HTTP request.
- CacheError: If there is an issue with the cache.
- ValueError: If no XML root is found in the response.
- NotImplementedError: If constraints are provided, as they are not implemented yet.
Notes:
If the dims argument is not valid, you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
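A hedged sketch of a typical call; the flow ID and the dimension name/value are assumptions for illustration, to be discovered via data_dimensions() and code_lists():

```python
from sdmxabs import fetch

# "CPI", "FREQ", and "Q" are assumptions for illustration.
data, meta = fetch("CPI", dims={"FREQ": "Q"}, validate=True)
print(data.shape, meta.shape)
```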
```python
def fetch_multi(
    wanted: pd.DataFrame,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with rows for each desired data set (of one or more series).
            Each row should contain the necessary identifiers to fetch the dataset.
            The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
            The 'flow_id' column is mandatory, and the rest are optional.
            Note: the DataFrame index is not used in the fetching process.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    Caution:
        The selected data should all have the same index. You cannot mix (for example)
        Quarterly and Monthly data in the same DataFrame.

    """
    # --- quick sanity checks
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return extract(wanted, validate=validate, **kwargs)
```
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args:
- wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process.
- validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False.
- **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns: A tuple containing two DataFrames:
- The first DataFrame contains the fetched data.
- The second DataFrame contains metadata about the fetched datasets.
Raises:
- ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Caution: The selected data should all have the same index. You cannot mix (for example) Quarterly and Monthly data in the same DataFrame.
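A sketch of building a `wanted` frame by hand; the dimension columns and values are assumptions for illustration (make_wanted(), below, builds such rows from match criteria):

```python
import pandas as pd
from sdmxabs import fetch_multi

# Column names other than "flow_id" are assumed dimensions for illustration.
wanted = pd.DataFrame(
    [
        {"flow_id": "CPI", "FREQ": "Q", "MEASURE": "1"},
        {"flow_id": "CPI", "FREQ": "Q", "MEASURE": "2"},
    ]
).astype(str)
data, meta = fetch_multi(wanted, validate=True)
```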
```python
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, validate=validate, **kwargs)
```
Fetch data based on selection criteria for items.
Args:
- flow_id (str): The ID of the data flow to fetch.
- criteria (MatchCriteria): A sequence of match criteria to filter the data.
- validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False.
- **kwargs: Additional keyword arguments for the fetch_multi function.

Returns:
- tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
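A hedged end-to-end sketch; the flow ID, dimension names, and patterns are assumptions for illustration:

```python
from sdmxabs import MatchType, fetch_selection, match_item

criteria = [
    # Assumed dimension names and patterns, for illustration only.
    match_item("Australia", "REGION", MatchType.PARTIAL),
    match_item("Q", "FREQ", MatchType.EXACT),
]
data, meta = fetch_selection("CPI", criteria, validate=True)
```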
```python
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the element name,
            the value to match, and the match type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be passed
        directly to fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Builds a one-row DataFrame. This frame may select multiple data series
          when passed to fetch_multi(). It can also be concatenated with other
          DataFrames to build a larger selection.
        - If two match elements refer to the same dimension, only the intersection
          of the matches will be returned.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- build the codelist dictionary
    return_dict: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        dim_dict = dimensions[dimension]
        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
            continue
        code_list_name = dim_dict.get("id")
        codes = get_codes(code_lists(code_list_name), pattern, match_type)
        package_codes(codes, dimension, return_dict)

    # --- return as a (one-row) `wanted` DataFrame
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict]).astype(str)
```
Build the wanted DataFrame for use by fetch_multi() by matching flow metadata.
Args:
- flow_id (str): The ID of the data flow to select items from.
- criteria (MatchCriteria): A sequence of tuples containing the element name, the value to match, and the match type (exact, partial, or regex).

Returns:
- pd.DataFrame: A DataFrame containing the selected items, which can be passed directly to fetch_multi().

Raises:
- ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Builds a one-row DataFrame. This frame may select multiple data series when passed to fetch_multi(). It can also be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the intersection of the matches will be returned.
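Because each call yields a one-row frame, several can be concatenated into a larger request. A sketch under assumed flow and dimension names:

```python
import pandas as pd
from sdmxabs import fetch_multi, make_wanted, match_item

# "CPI" and "REGION" are assumptions for illustration.
row_a = make_wanted("CPI", [match_item("Sydney", "REGION")])
row_b = make_wanted("CPI", [match_item("Melbourne", "REGION")])
wanted = pd.concat([row_a, row_b], ignore_index=True)
data, meta = fetch_multi(wanted)
```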
```python
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match item.

    """
    return (pattern, dimension, match_type)
```
Create a new MatchItem for use in select_items() and fetch_selection().
Args:
- pattern (str): The pattern to match.
- dimension (str): The dimension to match against.
- match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns:
- MatchItem: A tuple representing the match item.
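Since a MatchItem is just a (pattern, dimension, match_type) tuple, the helper is a thin constructor; the dimension name below is an assumption for illustration:

```python
from sdmxabs import MatchType, match_item

# "REGION" is an assumed dimension name for illustration.
item = match_item("Capital city", "REGION", MatchType.PARTIAL)
assert item == ("Capital city", "REGION", MatchType.PARTIAL)
```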