sdmxabs.fetch_multi

Fetch multiple datasets from the SDMX API.

  1"""Fetch multiple datasets from the SDMX API."""
  2
  3from typing import Unpack
  4
  5import pandas as pd
  6
  7from sdmxabs.download_cache import CacheError, GetFileKwargs, HttpError
  8from sdmxabs.fetch import fetch
  9
 10
 11# --- private function
 12def extract(
 13    wanted: pd.DataFrame, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]
 14) -> tuple[pd.DataFrame, pd.DataFrame]:  # data / metadata
 15    """Extract the data and metadata for each row in the dimensions DataFrame.
 16
 17    Args:
 18        wanted (pd.DataFrame): DataFrame containing the dimensions to fetch.
 19                               DataFrame cells with NAN values will be ignored.
 20                               The DataFrame must have a populated 'flow_id' column.
 21        validate (bool): If True, the function will validate the dimensions and values
 22                         against the ABS SDMX API codelists. Defaults to False.
 23        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
 24
 25    Returns:
 26        tuple[pd.DataFrame, pd.DataFrame]: A DataFrame with the fetched data and
 27                                        a DataFrame with the metadata.
 28
 29    Raises:
 30        ValueError: if any input data is not as expected.
 31
 32    Note: CacheError and HttpError are raised by the fetch function.
 33          These will be caught and reported to standard output.
 34
 35    """
    # --- initial setup - empty return results
    return_meta: dict[str, pd.Series] = {}
    return_data: dict[str, pd.Series] = {}
    counter = 0

    # --- loop over the rows of the wanted DataFrame
    for _index, row in wanted.iterrows():
        # --- get the arguments for the fetch (ignoring NaN values)
        row_dict: dict[str, str] = row.dropna().to_dict()
        flow_id = row_dict.pop("flow_id", "")
        if not flow_id:
            # --- if there is no flow_id, skip this row
            print(f"Skipping row with no flow_id: {row_dict}")
            continue

        # --- fetch the data and metadata for each row of the selection table
        try:
            data, meta = fetch(flow_id, dims=row_dict, validate=validate, **kwargs)
        except (CacheError, HttpError, ValueError) as e:
            # --- if there is an error, skip this row
            print(f"Error fetching {flow_id} with dimensions {row_dict}: {e}")
            continue
        if data.empty or meta.empty:
            # --- this should not happen, but if it does, skip this row
            print(f"No data for {flow_id} with dimensions {row_dict}")
            continue

        # --- manage duplicate column names with a unique counter suffix
        for col in data.columns:
            counter += 1
            save_name = col
            if save_name in return_data:
                save_name += f"_{counter:03d}"
            return_data[save_name] = data[col]
            return_meta[save_name] = meta.loc[col]

    return pd.DataFrame(return_data), pd.DataFrame(return_meta).T


# --- public function
def fetch_multi(
    wanted: pd.DataFrame,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with one row for each desired dataset (of one or
                more series). Each row should contain the identifiers needed
                to fetch the dataset. The columns are 'flow_id' plus the ABS
                dimensions relevant to the flow; 'flow_id' is mandatory, the
                rest are optional.
                Note: the DataFrame index is not used in the fetching process.
        validate: If True, validate dimensions and values against the ABS
                  SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data
                  fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError raised by the fetch function are caught and
        reported to standard output.

    Caution:
        The selected data should all have the same index. You cannot mix
        (for example) quarterly and monthly data in the same DataFrame.

    """
    # --- quick sanity checks
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return extract(wanted, validate=validate, **kwargs)
def extract(wanted: pd.DataFrame, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pd.DataFrame, pd.DataFrame]:

Extract the data and metadata for each row in the dimensions DataFrame.

Args:
    wanted (pd.DataFrame): DataFrame containing the dimensions to fetch. Cells with NaN values are ignored. The DataFrame must have a populated 'flow_id' column.
    validate (bool): If True, validate the dimensions and values against the ABS SDMX API codelists. Defaults to False.
    **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns:
    tuple[pd.DataFrame, pd.DataFrame]: A DataFrame with the fetched data and a DataFrame with the metadata.

Raises:
    ValueError: If any input data is not as expected.

Note:
    CacheError, HttpError and ValueError raised by the fetch function are caught and reported to standard output; the affected row is skipped.
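
To make the row-by-row behaviour concrete, here is a minimal sketch of a selection table that extract() would iterate over. The flow_id and dimension names ('CPI', MEASURE, REGION, FREQ) are illustrative placeholders, not confirmed codes for any particular ABS dataflow.

import pandas as pd

# Hypothetical selection table -- flow and dimension codes are placeholders.
wanted = pd.DataFrame(
    [
        {"flow_id": "CPI", "MEASURE": "1", "REGION": "50", "FREQ": "Q"},
        {"flow_id": "CPI", "MEASURE": "3", "REGION": "50"},  # no FREQ -> NaN
    ]
)
# Each row becomes one fetch(flow_id, dims=...) call. The second row's
# missing FREQ cell is NaN, so row.dropna() removes it and that dimension
# is left unconstrained for the request.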

def fetch_multi(wanted: pd.DataFrame, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pd.DataFrame, pd.DataFrame]:

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args:
    wanted: A DataFrame with one row for each desired dataset (of one or more series). Each row should contain the identifiers needed to fetch the dataset. The columns are 'flow_id' plus the ABS dimensions relevant to the flow; 'flow_id' is mandatory, the rest are optional. Note: the DataFrame index is not used in the fetching process.
    validate: If True, validate dimensions and values against the ABS SDMX API codelists. Defaults to False.
    **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns:
    A tuple containing two DataFrames:
    - The first DataFrame contains the fetched data.
    - The second DataFrame contains metadata about the fetched datasets.

Raises:
    ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note:
    CacheError and HttpError raised by the fetch function are caught and reported to standard output.

Caution:
    The selected data should all have the same index. You cannot mix (for example) quarterly and monthly data in the same DataFrame.
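
A minimal usage sketch, assuming fetch_multi is importable from the sdmxabs package top level (as the signature above suggests). The flow_id and dimension codes below are placeholders; passing validate=True asks the function to check them against the ABS codelists.

import pandas as pd
import sdmxabs

# Hypothetical selection -- codes are placeholders, checked via validate=True.
wanted = pd.DataFrame(
    [
        {"flow_id": "CPI", "MEASURE": "1", "REGION": "50", "FREQ": "Q"},
        {"flow_id": "CPI", "MEASURE": "3", "REGION": "50", "FREQ": "Q"},
    ]
)

data, meta = sdmxabs.fetch_multi(wanted, validate=True)
print(data.tail())  # one column per fetched series
print(meta.index)   # metadata rows are keyed by the same column names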