sdmxabs.fetch_selection
Select items from the ABS Catalogue based on search criteria.
"""Select items from the ABS Catalogue based on search criteria."""

import re
from collections.abc import Sequence
from enum import Enum
from typing import Unpack

import pandas as pd

from sdmxabs.download_cache import GetFileKwargs
from sdmxabs.fetch_multi import fetch_multi
from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows


# --- some types specific to this module
class MatchType(Enum):
    """Enumeration for match types."""

    EXACT = 1  # the code's name must equal the pattern
    PARTIAL = 2  # the pattern is a case-insensitive substring of the code's name
    REGEX = 3  # re.match(pattern, name) succeeds (anchored at the start of the name)


MatchItem = tuple[str, str, MatchType]  # pattern, dimension, MatchType
MatchCriteria = Sequence[MatchItem]  # Sequence of tuples containing (pattern, dimension, MatchType)


def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
    """Package the codes into the return dictionary for a given dimension.

    The return_dict is modified in place; nothing is returned.

    If the dimension already exists in the return_dict, the newly identified codes
    are intersected with the existing codes. If that intersection is empty, the
    dimension is removed from the return_dict (i.e. it reverts to a global match
    on that dimension). If the dimension is not yet present and no codes were
    matched, the return_dict is left untouched.

    Note: multiple matched codes are separated by a '+' sign in the return_dict,
    and are written in sorted order so that the same selection always produces
    the same string.

    Args:
        codes (list[str]): The codes matched for this dimension (duplicates are ignored).
        dimension (str): The dimension the codes belong to.
        return_dict (dict[str, str]): Mapping of dimension -> '+'-joined codes.

    """
    wanted = set(codes)
    if dimension in return_dict:
        wanted &= set(return_dict[dimension].split("+"))
        if not wanted:
            del return_dict[dimension]  # no matches, remove dimension
            return
    if wanted:
        # sorted(): set iteration order is not stable between interpreter runs
        return_dict[dimension] = "+".join(sorted(wanted))


# --- private functions
def get_codes(
    code_list_dict: FlowMetaDict,
    pattern: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> list[str]:
    """Obtain all codes whose name matches the pattern.

    Args:
        code_list_dict (FlowMetaDict): Mapping of code -> attributes; the "name"
            attribute is the text that is matched.
        pattern (str): The text (or regular expression) to look for.
        match_type (MatchType, optional): How to compare. Defaults to MatchType.PARTIAL.

    Returns:
        list[str]: The matching codes, in the iteration order of code_list_dict.

    """
    codes = []
    for code, code_list in code_list_dict.items():
        name = code_list.get("name", "")
        if not name:
            # should not happen.
            print(f"Code {code} has no name; (skipping)")
            continue
        if match_type == MatchType.EXACT:
            matched = name == pattern
        elif match_type == MatchType.PARTIAL:
            # Case-insensitive partial match
            matched = pattern.lower() in name.lower()
        elif match_type == MatchType.REGEX:
            # re.match only matches at the start of the name
            matched = re.match(pattern, name) is not None
        else:
            matched = False
        if matched:
            codes.append(code)
    return codes


# --- public functions
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)


def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension, match type)
            tuples, where the match type is exact, partial, or regex.

    Returns:
        pd.DataFrame: A one-row DataFrame of strings with a `flow_id` column plus
            one column per dimension for which codes were selected.

    Raises:
        ValueError: If the flow_id is not valid or if the flow has no dimensions.

    Notes:
        - Should build a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned.
        - A criterion naming an unknown dimension, or a dimension without a codelist,
          is skipped with a printed message; no exception is raised, even when
          nothing at all is matched.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- lets build the codelist dictionary
    return_dict: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        dim_dict = dimensions[dimension]
        if dim_dict.get("package") != "codelist" or "id" not in dim_dict:
            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
            continue
        codes = get_codes(code_lists(dim_dict["id"]), pattern, match_type)
        package_codes(codes, dimension, return_dict)

    # --- return as a (one row) `wanted` DataFrame
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict]).astype(str)


def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Forwarded to fetch_multi(). Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, validate=validate, **kwargs)


# --- quick and dirty testing
if __name__ == "__main__":
    # --- specify a selection from the Wage Price Index (WPI) data flow
    mat_criteria = [
        match_item("Australia", "REGION", MatchType.EXACT),
        match_item(
            "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT
        ),
        match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
        match_item("Seas|Trend", "TSEST", MatchType.REGEX),
        match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
        match_item("Private and Public", "SECTOR", MatchType.EXACT),
    ]

    # --- test the selection
    print(make_wanted("WPI", mat_criteria))
    data, meta = fetch_selection("WPI", mat_criteria)
    print(f"Number of data series: {len(meta)}")  # should be 2
    print(meta.T)  # should have the Trend and Seasonally Adjusted series
class MatchType(Enum):
    """The ways a search pattern can be compared with a code's name."""

    # The code's name must equal the pattern.
    EXACT = 1
    # The pattern is a case-insensitive substring of the code's name.
    PARTIAL = 2
    # The pattern is a regular expression applied with re.match (start-anchored).
    REGEX = 3
Enumeration for match types.
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
    """Package the codes into the return dictionary for a given dimension.

    The return_dict is modified in place; nothing is returned.

    If the dimension already exists in the return_dict, the newly identified codes
    are intersected with the existing codes. If that intersection is empty, the
    dimension is removed from the return_dict (i.e. it reverts to a global match
    on that dimension). If the dimension is not yet present and no codes were
    matched, the return_dict is left untouched.

    Note: multiple matched codes are separated by a '+' sign in the return_dict,
    and are written in sorted order so that the same selection always produces
    the same string.

    Args:
        codes (list[str]): The codes matched for this dimension (duplicates are ignored).
        dimension (str): The dimension the codes belong to.
        return_dict (dict[str, str]): Mapping of dimension -> '+'-joined codes.

    """
    wanted = set(codes)
    if dimension in return_dict:
        wanted &= set(return_dict[dimension].split("+"))
        if not wanted:
            del return_dict[dimension]  # no matches, remove dimension
            return
    if wanted:
        # sorted(): set iteration order is not stable between interpreter runs
        return_dict[dimension] = "+".join(sorted(wanted))
Package the codes into the return dictionary for a given dimension.
If the dimension already exists in the return_dict, the newly identified codes are intersected with the existing codes. If the intersection is empty, the dimension is removed from the return_dict (i.e. it reverts to a global match on that dimension).
Note: multiple matched codes are separated by a '+' sign in the return_dict.
def get_codes(
    code_list_dict: FlowMetaDict,
    pattern: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> list[str]:
    """Collect every code in a code list whose name satisfies the pattern.

    Args:
        code_list_dict (FlowMetaDict): Mapping of code -> attributes; the "name"
            attribute is the text that is compared with the pattern.
        pattern (str): The text (or regular expression) to look for.
        match_type (MatchType, optional): EXACT requires equality, PARTIAL is a
            case-insensitive substring test, REGEX applies re.match (which only
            matches at the start of the name). Defaults to MatchType.PARTIAL.

    Returns:
        list[str]: The matching codes, in the iteration order of code_list_dict.

    """
    found: list[str] = []
    for code, attributes in code_list_dict.items():
        label = attributes.get("name", "")
        if not label:
            # should not happen.
            print(f"Code {code} has no name; (skipping)")
            continue

        if match_type == MatchType.EXACT:
            is_hit = label == pattern
        elif match_type == MatchType.PARTIAL:
            # compare in lower case so the substring test ignores case
            is_hit = pattern.lower() in label.lower()
        elif match_type == MatchType.REGEX:
            is_hit = re.match(pattern, label) is not None
        else:
            # anything that is not a known MatchType selects nothing
            is_hit = False

        if is_hit:
            found.append(code)
    return found
Obtain all codes matching the pattern.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in make_wanted() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match item.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension, match type)
            tuples, where the match type is exact, partial, or regex.

    Returns:
        pd.DataFrame: A one-row DataFrame of strings with a `flow_id` column plus
            one column per dimension for which codes were selected.

    Raises:
        ValueError: If the flow_id is not valid or if the flow has no dimensions.

    Notes:
        - Should build a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned.
        - A criterion naming an unknown dimension, or a dimension without a codelist,
          is skipped with a printed message; no exception is raised, even when
          nothing at all is matched.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- lets build the codelist dictionary
    return_dict: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        dim_dict = dimensions[dimension]
        # only codelist-backed dimensions that name their codelist can be searched
        if dim_dict.get("package") != "codelist" or "id" not in dim_dict:
            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
            continue
        codes = get_codes(code_lists(dim_dict["id"]), pattern, match_type)
        package_codes(codes, dimension, return_dict)

    # --- return as a (one row) `wanted` DataFrame
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict]).astype(str)
Build the wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no dimensions are found for the flow.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Resolve the match criteria for a data flow and fetch the selected series.

    The criteria are first turned into a `wanted` DataFrame by make_wanted(),
    and that frame is then handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Forwarded unchanged to fetch_multi().
            Defaults to False.
        **kwargs: Additional keyword arguments, forwarded to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: Whatever fetch_multi() returns for the
            selection - a (data, metadata) pair of DataFrames.

    """
    wanted = make_wanted(flow_id, criteria)
    data_and_meta = fetch_multi(wanted, validate=validate, **kwargs)
    return data_and_meta
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.