sdmxabs.fetch_selection

Select items from the ABS Catalogue based on search criteria.

  1"""Select items from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6from typing import Unpack
  7
  8import pandas as pd
  9
 10from sdmxabs.download_cache import GetFileKwargs
 11from sdmxabs.fetch_multi import fetch_multi
 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows
 13
 14
 15# --- some types specific to this module
 16class MatchType(Enum):
 17    """Enumeration for match types."""
 18
 19    EXACT = 1
 20    PARTIAL = 2
 21    REGEX = 3
 22
 23
 24MatchItem = tuple[str, str, MatchType]  # pattern, dimension, MatchType
 25MatchCriteria = Sequence[MatchItem]  # Sequence of tuples containing (pattern, dimension, MatchType)
 26
 27
 28def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
 29    """Package the codes into the return dictionary for a given dimension.
 30
 31    If the dimension already exists in the return_dict, we will intersect the newly
 32    identified  codes with the existing codes. If the intersection is a null set, the
 33    dimension will be removed from the return_dict (ie. the global match).
 34
 35    Note: multiple matched codes are separated by a '+' sign in the return_dict.
 36
 37    """
 38    if dimension in return_dict:
 39        previous = return_dict[dimension].split("+")
 40        codes = list(set(previous).intersection(set(codes)))
 41        if not codes:
 42            del return_dict[dimension]  # no matches, remove dimension
 43    if codes:
 44        return_dict[dimension] = "+".join(list(set(codes)))
 45
 46
 47# --- private functions
 48def get_codes(
 49    code_list_dict: FlowMetaDict,
 50    pattern: str,
 51    match_type: MatchType = MatchType.PARTIAL,
 52) -> list[str]:
 53    """Obtain all codes matching the pattern."""
 54    codes = []
 55    for code, code_list in code_list_dict.items():
 56        name = code_list.get("name", "")
 57        if not name:
 58            # should not happen.
 59            print(f"Code {code} has no name; (skipping)")
 60            continue
 61        match match_type:
 62            case MatchType.EXACT:
 63                if name == pattern:
 64                    codes.append(code)
 65            case MatchType.PARTIAL:
 66                # Case-insensitive partial match
 67                if pattern.lower() in name.lower():
 68                    codes.append(code)
 69            case MatchType.REGEX:
 70                if re.match(pattern, name):
 71                    codes.append(code)
 72    return codes
 73
 74
 75# --- public functions
 76def match_item(
 77    pattern: str,
 78    dimension: str,
 79    match_type: MatchType = MatchType.PARTIAL,
 80) -> MatchItem:
 81    """Create a new MatchItem for use in select_items() and fetch_selection().
 82
 83    Args:
 84        pattern (str): The pattern to match.
 85        dimension (str): The dimension to match against.
 86        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT.
 87
 88    Returns:
 89        MatchElement: A tuple representing the match element.
 90
 91    """
 92    return (pattern, dimension, match_type)
 93
 94
 95def make_wanted(
 96    flow_id: str,
 97    criteria: MatchCriteria,
 98) -> pd.DataFrame:
 99    """Build the `wanted` Dataframe for use by fetch_multi() by matching flow metadata.
100
101    Args:
102        flow_id (str): The ID of the data flow to select items from.
103        criteria (MatchElements): A sequence of tuples containing the element name,
104            the value to match, and the match type (exact, partial, or regex).
105
106    Returns:
107        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
108            into the call of the function fetch_multi().
109
110    Raises:
111        ValueError: If the flow_id is not valid or if no items match the criteria.
112
113    Notes:
114    -   Should build a one line DataFrame. This Frame may select multiple data series,
115        when passed to fetch_multi. It also can be concatenated with other DataFrames
116        to build a larger selection.
117    -   If two match elements refer to the same dimension, only the `intersection` of the
118        matches will be returned.
119
120    """
121    # --- some sanity checks
122    if flow_id not in data_flows():
123        raise ValueError(f"Invalid flow_id: {flow_id}.")
124    dimensions = data_dimensions(flow_id)
125    if not dimensions:
126        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
127
128    # --- lets build the codelist dictionary
129    return_dict: dict[str, str] = {}
130    for pattern, dimension, match_type in criteria:
131        if dimension not in dimensions:
132            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
133            continue
134        dim_dict = dimensions[dimension]
135        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
136            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
137            continue
138        code_list_name = dim_dict.get("id")
139        codes = get_codes(code_lists(code_list_name), pattern, match_type)
140        package_codes(codes, dimension, return_dict)
141
142    # --- return as a (one row) `wanted` DataFrame
143    return_dict["flow_id"] = flow_id
144    return pd.DataFrame([return_dict]).astype(str)
145
146
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow with match criteria, then fetch them.

    The criteria are turned into a `wanted` DataFrame by make_wanted(), and that
    DataFrame is handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Passed through to fetch_multi() as its
            `validate` argument. Defaults to False.
        **kwargs: Additional keyword arguments passed through to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The pair returned by fetch_multi()
            (the fetched data and its metadata).

    """
    return fetch_multi(make_wanted(flow_id, criteria), validate=validate, **kwargs)
169
170
171# --- quick and dirty testing
if __name__ == "__main__":
    # --- a selection from the Wage Price Index (WPI) data flow, one criterion per dimension
    mat_criteria = [
        match_item("Australia", "REGION", MatchType.EXACT),
        match_item(
            "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT
        ),
        match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
        match_item("Seas|Trend", "TSEST", MatchType.REGEX),
        match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
        match_item("Private and Public", "SECTOR", MatchType.EXACT),
    ]

    # --- show the `wanted` frame, then fetch with the same criteria
    print(make_wanted("WPI", mat_criteria))
    data, meta = fetch_selection("WPI", mat_criteria)
    print(f"Number of data series: {len(meta)}")  # should be 2
    print(meta.T)  # should have the Trend and Seasonally Adjusted series
class MatchType(enum.Enum):
class MatchType(Enum):
    """Ways in which a pattern can be compared with the name of a code."""

    EXACT = 1  # the name must equal the pattern
    PARTIAL = 2  # the pattern must occur within the name, ignoring case
    REGEX = 3  # the pattern is a regular expression applied with re.match()

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
    """Record the matched codes for a dimension in return_dict (modified in place).

    If the dimension is not yet in return_dict, the codes are stored as a single
    '+'-separated string; an empty list of codes stores nothing. If the dimension
    is already present, only the codes common to the stored entry and the new
    codes are kept. If nothing is common, the dimension is removed from
    return_dict altogether (i.e. it falls back to the global match).

    Duplicates are dropped and first-seen order is kept, so the stored string is
    the same on every run. (Joining a bare set gives an arbitrary order.)

    Args:
        codes (list[str]): The codes newly matched for the dimension.
        dimension (str): The dimension the codes belong to.
        return_dict (dict[str, str]): Mapping of dimension to '+'-joined codes;
            updated in place.

    """
    if dimension in return_dict:
        newly_matched = set(codes)
        # Intersect, walking the stored codes so their existing order survives.
        codes = [code for code in return_dict[dimension].split("+") if code in newly_matched]
        if not codes:
            del return_dict[dimension]  # no matches, remove dimension
            return
    if codes:
        # dict.fromkeys() de-duplicates while preserving insertion order.
        return_dict[dimension] = "+".join(dict.fromkeys(codes))

Package the codes into the return dictionary for a given dimension.

If the dimension already exists in the return_dict, we will intersect the newly identified codes with the existing codes. If the intersection is a null set, the dimension will be removed from the return_dict (ie. the global match).

Note: multiple matched codes are separated by a '+' sign in the return_dict.

def get_codes( code_list_dict: dict[str, dict[str, str]], pattern: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> list[str]:
49def get_codes(
50    code_list_dict: FlowMetaDict,
51    pattern: str,
52    match_type: MatchType = MatchType.PARTIAL,
53) -> list[str]:
54    """Obtain all codes matching the pattern."""
55    codes = []
56    for code, code_list in code_list_dict.items():
57        name = code_list.get("name", "")
58        if not name:
59            # should not happen.
60            print(f"Code {code} has no name; (skipping)")
61            continue
62        match match_type:
63            case MatchType.EXACT:
64                if name == pattern:
65                    codes.append(code)
66            case MatchType.PARTIAL:
67                # Case-insensitive partial match
68                if pattern.lower() in name.lower():
69                    codes.append(code)
70            case MatchType.REGEX:
71                if re.match(pattern, name):
72                    codes.append(code)
73    return codes

Obtain all codes matching the pattern.

def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)

Create a new MatchItem for use in make_wanted() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: The tuple (pattern, dimension, match_type).

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
 96def make_wanted(
 97    flow_id: str,
 98    criteria: MatchCriteria,
 99) -> pd.DataFrame:
100    """Build the `wanted` Dataframe for use by fetch_multi() by matching flow metadata.
101
102    Args:
103        flow_id (str): The ID of the data flow to select items from.
104        criteria (MatchElements): A sequence of tuples containing the element name,
105            the value to match, and the match type (exact, partial, or regex).
106
107    Returns:
108        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
109            into the call of the function fetch_multi().
110
111    Raises:
112        ValueError: If the flow_id is not valid or if no items match the criteria.
113
114    Notes:
115    -   Should build a one line DataFrame. This Frame may select multiple data series,
116        when passed to fetch_multi. It also can be concatenated with other DataFrames
117        to build a larger selection.
118    -   If two match elements refer to the same dimension, only the `intersection` of the
119        matches will be returned.
120
121    """
122    # --- some sanity checks
123    if flow_id not in data_flows():
124        raise ValueError(f"Invalid flow_id: {flow_id}.")
125    dimensions = data_dimensions(flow_id)
126    if not dimensions:
127        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
128
129    # --- lets build the codelist dictionary
130    return_dict: dict[str, str] = {}
131    for pattern, dimension, match_type in criteria:
132        if dimension not in dimensions:
133            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
134            continue
135        dim_dict = dimensions[dimension]
136        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
137            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
138            continue
139        code_list_name = dim_dict.get("id")
140        codes = get_codes(code_lists(code_list_name), pattern, match_type)
141        package_codes(codes, dimension, return_dict)
142
143    # --- return as a (one row) `wanted` DataFrame
144    return_dict["flow_id"] = flow_id
145    return pd.DataFrame([return_dict]).astype(str)

Build the wanted DataFrame for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples, each containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no dimensions are found for the flow. (A criterion that matches no codes does not raise; it simply adds no column.)

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow with match criteria, then fetch them.

    The criteria are turned into a `wanted` DataFrame by make_wanted(), and that
    DataFrame is handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Passed through to fetch_multi() as its
            `validate` argument. Defaults to False.
        **kwargs: Additional keyword arguments passed through to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The pair returned by fetch_multi()
            (the fetched data and its metadata).

    """
    return fetch_multi(make_wanted(flow_id, criteria), validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.