mgplot.summary_plot

summary_plot.py:

Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.

  1"""
  2summary_plot.py:
  3
  4Produce a summary plot for the data in a given DataFrame.
  5The data is normalised to z-scores and scaled.
  6"""
  7
  8# --- imports
  9# system imports
 10from typing import Any, NotRequired, Unpack
 11
 12# analytic third-party imports
 13from numpy import ndarray, array
 14from matplotlib.pyplot import Axes
 15from pandas import DataFrame, Period
 16
 17# local imports
 18from mgplot.settings import DataT
 19from mgplot.utilities import get_axes, label_period
 20from mgplot.finalise_plot import make_legend
 21from mgplot.utilities import constrain_data, check_clean_timeseries
 22from mgplot.keyword_checking import (
 23    report_kwargs,
 24    validate_kwargs,
 25    BaseKwargs,
 26)
 27
 28
 29# --- constants
ME = "summary_plot"  # name of the public plotting function defined in this module
ZSCORES = "zscores"  # plot_type value: plot the unscaled z-scores
ZSCALED = "zscaled"  # plot_type value: plot z-scores rescaled to the range -1 to +1
 33
 34
class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function.

    Every key is optional. The schema is handed to validate_kwargs()
    by summary_plot() to check the caller's keyword arguments.
    """

    # Axes to draw on - presumably consumed by get_axes(); TODO confirm
    ax: NotRequired[Axes | None]
    # if True, calculate_z() prints a per-column summary table
    verbose: NotRequired[bool]
    # middle proportion of the data drawn as the grey bar (0.8 when omitted)
    middle: NotRequired[float]
    # ZSCORES ("zscores", used when omitted) or ZSCALED ("zscaled")
    plot_type: NotRequired[str]
    # start of the plotted data; an int is used as a position in the index
    plot_from: NotRequired[int | Period]
    # legend settings passed to make_legend(); summary_plot() supplies a
    # default dictionary when the key is omitted
    legend: NotRequired[dict[str, Any]]
    # x-axis label: None means no label is set, an empty string (also the
    # value used when omitted) means a default label is generated
    xlabel: NotRequired[str | None]
 45
 46
 47# --- functions
 48def _calc_quantiles(middle: float) -> ndarray:
 49    """Calculate the quantiles for the middle of the data."""
 50    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])
 51
 52
 53def calculate_z(
 54    original: DataFrame,  # only contains the data points of interest
 55    middle: float,  # middle proportion of data to highlight (eg. 0.8)
 56    verbose: bool = False,  # print the summary data
 57) -> tuple[DataFrame, DataFrame]:
 58    """Calculate z-scores, scaled z-scores and middle quantiles.
 59    Return z_scores, z_scaled, q (which are the quantiles for the
 60    start/end of the middle proportion of data to highlight)."""
 61
 62    # calculate z-scores, scaled scores and middle quantiles
 63    z_scores: DataFrame = (original - original.mean()) / original.std()
 64    z_scaled: DataFrame = (
 65        # scale z-scores between -1 and +1
 66        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5) * 2
 67    )
 68    q_middle = _calc_quantiles(middle)
 69
 70    if verbose:
 71        frame = DataFrame(
 72            {
 73                "count": original.count(),
 74                "mean": original.mean(),
 75                "median": original.median(),
 76                "min shaded": original.quantile(q=q_middle[0]),
 77                "max shaded": original.quantile(q=q_middle[1]),
 78                "z-scores": z_scores.iloc[-1],
 79                "scaled": z_scaled.iloc[-1],
 80            }
 81        )
 82        print(frame)
 83
 84    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting
 85
 86
 87def _plot_middle_bars(
 88    adjusted: DataFrame,
 89    middle: float,
 90    kwargs: dict[str, Any],  # must be a dictionary, not a splat
 91) -> Axes:
 92    """Plot the middle (typically 80%) of the data as a bar.
 93    Note: also sets the x-axis limits in kwargs.
 94    Return the matplotlib Axes object."""
 95
 96    q = _calc_quantiles(middle)
 97    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data
 98    span = 1.15
 99    space = 0.2
100    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -span) - space
101    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), span) + space
102    kwargs["xlim"] = (low, high)  # update the kwargs with the xlim
103    ax, _ = get_axes(**kwargs)
104    ax.barh(
105        y=lo_hi.index,
106        width=lo_hi[q[1]] - lo_hi[q[0]],
107        left=lo_hi[q[0]],
108        color="#bbbbbb",
109        label=f"Middle {middle * 100:0.0f}% of prints",
110    )
111    return ax
112
113
def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int | str,
) -> None:
    """Add the latest datapoints to the summary plot.

    Args:
    - ax: Axes object to draw on.
    - original: the unadjusted data; its last-row values are the text shown.
    - adjusted: the adjusted data; its last row gives the x positions.
    - f_size: font size for the value labels.
    """

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        # the text shows the original value, positioned at the adjusted value
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,  # honour the caller's font size
        )
134
135
def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Apply the x-axis limits held in kwargs["xlim"] and, for scaled
    plots only, mark each column's median and print its original
    minimum and maximum at the left and right edges of the plot."""

    original, adjusted = data
    low, high = kwargs["xlim"]
    ax.set_xlim(low, high)

    if plot_type != ZSCALED:
        return  # nothing further to annotate

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=5,
        label="Median",
    )

    shared = {"va": "center", "size": f_size}
    for position, name in enumerate(original.columns):
        series = original[name]
        ax.text(low, position, f" {series.min():.2f}", ha="left", **shared)
        ax.text(high, position, f"{series.max():.2f} ", ha="right", **shared)
174
175
def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the horizontal bar plot: the middle-of-data bars, then the
    latest datapoints, then the labels for the extremes.

    kwargs is passed as a dictionary (not a splat) because
    _plot_middle_bars() writes the x-axis limits into it and
    label_extremes() reads them back."""

    annotation_size = "x-small"
    ax = _plot_middle_bars(adjusted, middle, kwargs)
    plot_latest_datapoint(ax, original, adjusted, annotation_size)
    label_extremes(ax, (original, adjusted), plot_type, annotation_size, kwargs)
    return ax
195
196
def label_x_axis(
    plot_from: int | Period,
    label: str | None,
    plot_type: str,
    ax: Axes,
    df: DataFrame,
) -> None:
    """Set the x-axis label.

    A label of None leaves the axis unlabelled. An empty label is
    replaced with a default that names the plot type and the starting
    period. Any other label is used as given."""

    # an integer plot_from is treated as a position in the DataFrame index
    if isinstance(plot_from, Period):
        start: Period = plot_from
    else:
        start = df.index[plot_from]

    if label is None:
        return

    if not label:
        prefix = "Z-scores for prints since" if plot_type == ZSCORES else "-1 to 1 scaled z-scores since"
        label = f"{prefix} {label_period(start)}"
    ax.set_xlabel(label)
208
209
def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Draw the dashed vertical reference lines: at -1 and +1 for scaled
    plots, at 0 for z-score plots, and none for any other plot type."""

    if plot_type == ZSCALED:
        marks: tuple[tuple[int, str], ...] = ((-1, "-1"), (1, "+1"))
    elif plot_type == ZSCORES:
        marks = ((0, "0"),)
    else:
        marks = ()

    line_style = {"color": "#555555", "linewidth": 0.5, "linestyle": "--"}
    for position, text in marks:
        ax.axvline(position, label=text, **line_style)
218
219
def plot_the_data(df: DataFrame, **kwargs) -> tuple[Axes, str]:
    """Produce the summary plot for the data.

    Args:
    - df: DataFrame containing the data to plot.
    - kwargs: "verbose", "middle" and "plot_type" are consumed here; the
      remainder is passed to constrain_data(). A "legend" entry is
      expected in the dictionary that constrain_data() returns.

    Returns:
    - ax: Axes object containing the plot.
    - plot_type: type of plot, either 'zscores' or 'zscaled'.
    """

    # pull out the arguments handled by this function
    show_summary = kwargs.pop("verbose", False)
    proportion = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)

    # limit the data, then standardise it
    subset, remaining = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(subset, proportion, verbose=show_summary)

    # anything other than ZSCORES is plotted using the scaled data
    if plot_type == ZSCORES:
        adjusted = z_scores
    else:
        adjusted = z_scaled

    ax = horizontal_bar_plot(subset, adjusted, proportion, plot_type, remaining)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, remaining["legend"])
    ax.set_xlim(remaining.get("xlim"))  # provide space for the labels

    return ax, plot_type
246
247
248# --- public
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot (see SummaryKwargs), including:
        - verbose: print a per-column summary table (default False).
        - middle: middle proportion of the data to highlight (default 0.8).
        - plot_type: 'zscores' (the default) or 'zscaled'.
        - plot_from: where the plotted data starts; also used in the
          default x-axis label (default 0).
        - legend: legend settings; by default the legend is placed
          below the x-axis label.
        - xlabel: x-axis label; None for no label, and an empty string
          (the default) for a generated label.

    Returns Axes.

    Raises TypeError if the checked data is not a DataFrame.
    """

    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- legend (only applied when the caller did not supply one)
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # --- and plot it ...
    ax, plot_type = plot_the_data(df, **kwargs)
    label_x_axis(
        kwargs.get("plot_from", 0),
        label=kwargs.get("xlabel", ""),
        plot_type=plot_type,
        ax=ax,
        df=df,
    )
    mark_reference_lines(plot_type, ax)

    return ax
ME = 'summary_plot'  # name of the public plotting function defined in this module
ZSCORES = 'zscores'  # plot_type value: plot the unscaled z-scores
ZSCALED = 'zscaled'  # plot_type value: plot z-scores rescaled to the range -1 to +1
class SummaryKwargs(mgplot.keyword_checking.BaseKwargs):
class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function.

    Every key is optional. The schema is handed to validate_kwargs()
    by summary_plot() to check the caller's keyword arguments.
    """

    # Axes to draw on - presumably consumed by get_axes(); TODO confirm
    ax: NotRequired[Axes | None]
    # if True, calculate_z() prints a per-column summary table
    verbose: NotRequired[bool]
    # middle proportion of the data drawn as the grey bar (0.8 when omitted)
    middle: NotRequired[float]
    # ZSCORES ("zscores", used when omitted) or ZSCALED ("zscaled")
    plot_type: NotRequired[str]
    # start of the plotted data; an int is used as a position in the index
    plot_from: NotRequired[int | Period]
    # legend settings passed to make_legend(); summary_plot() supplies a
    # default dictionary when the key is omitted
    legend: NotRequired[dict[str, Any]]
    # x-axis label: None means no label is set, an empty string (also the
    # value used when omitted) means a default label is generated
    xlabel: NotRequired[str | None]

Keyword arguments for the summary_plot function.

ax: NotRequired[matplotlib.axes._axes.Axes | None]
verbose: NotRequired[bool]
middle: NotRequired[float]
plot_type: NotRequired[str]
plot_from: NotRequired[int | pandas._libs.tslibs.period.Period]
legend: NotRequired[dict[str, Any]]
xlabel: NotRequired[str | None]
def calculate_z( original: pandas.core.frame.DataFrame, middle: float, verbose: bool = False) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def calculate_z(
    original: DataFrame,  # only contains the data points of interest
    middle: float,  # middle proportion of data to highlight (eg. 0.8)
    verbose: bool = False,  # print the summary data
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores and scaled z-scores for each column.

    Args:
    - original: DataFrame holding only the data points of interest.
    - middle: middle proportion of the data to highlight (eg. 0.8). It is
      only used for the quantile columns of the verbose summary.
    - verbose: if True, print a per-column summary table.

    Returns:
    - z_scores: (original - column mean) / column standard deviation.
    - z_scaled: the z-scores rescaled linearly so that each column's
      minimum maps to -1 and its maximum maps to +1.

    Note: there is no guard against a zero standard deviation or a zero
    min-max range; such columns are simply divided by zero.
    """

    # calculate z-scores and scaled scores
    centre = original.mean()
    z_scores: DataFrame = (original - centre) / original.std()
    z_low = z_scores.min()
    z_scaled: DataFrame = (
        # scale z-scores between -1 and +1
        (((z_scores - z_low) / (z_scores.max() - z_low)) - 0.5) * 2
    )

    if verbose:
        # the middle quantiles are only needed for the printed summary
        q_middle = _calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": centre,
                "median": original.median(),
                "min shaded": original.quantile(q=q_middle[0]),
                "max shaded": original.quantile(q=q_middle[1]),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            }
        )
        print(frame)

    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting

Calculate z-scores and scaled z-scores for each column. Return z_scores and z_scaled. The quantiles for the start/end of the middle proportion of data to highlight are used only in the verbose summary and are not returned.

def plot_latest_datapoint( ax: matplotlib.axes._axes.Axes, original: pandas.core.frame.DataFrame, adjusted: pandas.core.frame.DataFrame, f_size: int | str) -> None:
def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int | str,
) -> None:
    """Add the latest datapoints to the summary plot.

    Args:
    - ax: Axes object to draw on.
    - original: the unadjusted data; its last-row values are the text shown.
    - adjusted: the adjusted data; its last row gives the x positions.
    - f_size: font size for the value labels.
    """

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        # the text shows the original value, positioned at the adjusted value
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,  # honour the caller's font size
        )

Add the latest datapoints to the summary plot

def label_extremes( ax: matplotlib.axes._axes.Axes, data: tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame], plot_type: str, f_size: int | str, kwargs: dict[str, typing.Any]) -> None:
def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Apply the x-axis limits held in kwargs["xlim"] and, for scaled
    plots only, mark each column's median and print its original
    minimum and maximum at the left and right edges of the plot."""

    original, adjusted = data
    low, high = kwargs["xlim"]
    ax.set_xlim(low, high)

    if plot_type != ZSCALED:
        return  # nothing further to annotate

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=5,
        label="Median",
    )

    shared = {"va": "center", "size": f_size}
    for position, name in enumerate(original.columns):
        series = original[name]
        ax.text(low, position, f" {series.min():.2f}", ha="left", **shared)
        ax.text(high, position, f"{series.max():.2f} ", ha="right", **shared)

Label the extremes in the scaled plots.

def horizontal_bar_plot( original: pandas.core.frame.DataFrame, adjusted: pandas.core.frame.DataFrame, middle: float, plot_type: str, kwargs: dict[str, typing.Any]) -> matplotlib.axes._axes.Axes:
def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the horizontal bar plot: the middle-of-data bars, then the
    latest datapoints, then the labels for the extremes.

    kwargs is passed as a dictionary (not a splat) because
    _plot_middle_bars() writes the x-axis limits into it and
    label_extremes() reads them back."""

    annotation_size = "x-small"
    ax = _plot_middle_bars(adjusted, middle, kwargs)
    plot_latest_datapoint(ax, original, adjusted, annotation_size)
    label_extremes(ax, (original, adjusted), plot_type, annotation_size, kwargs)
    return ax

Plot horizontal bars for the middle of the data.

def label_x_axis( plot_from: int | pandas._libs.tslibs.period.Period, label: str | None, plot_type: str, ax: matplotlib.axes._axes.Axes, df: pandas.core.frame.DataFrame) -> None:
def label_x_axis(
    plot_from: int | Period,
    label: str | None,
    plot_type: str,
    ax: Axes,
    df: DataFrame,
) -> None:
    """Set the x-axis label.

    A label of None leaves the axis unlabelled. An empty label is
    replaced with a default that names the plot type and the starting
    period. Any other label is used as given."""

    # an integer plot_from is treated as a position in the DataFrame index
    if isinstance(plot_from, Period):
        start: Period = plot_from
    else:
        start = df.index[plot_from]

    if label is None:
        return

    if not label:
        prefix = "Z-scores for prints since" if plot_type == ZSCORES else "-1 to 1 scaled z-scores since"
        label = f"{prefix} {label_period(start)}"
    ax.set_xlabel(label)

Label the x-axis for the plot.

def mark_reference_lines(plot_type: str, ax: matplotlib.axes._axes.Axes) -> None:
def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Draw the dashed vertical reference lines: at -1 and +1 for scaled
    plots, at 0 for z-score plots, and none for any other plot type."""

    if plot_type == ZSCALED:
        marks: tuple[tuple[int, str], ...] = ((-1, "-1"), (1, "+1"))
    elif plot_type == ZSCORES:
        marks = ((0, "0"),)
    else:
        marks = ()

    line_style = {"color": "#555555", "linewidth": 0.5, "linestyle": "--"}
    for position, text in marks:
        ax.axvline(position, label=text, **line_style)

Mark the reference lines for the plot.

def plot_the_data( df: pandas.core.frame.DataFrame, **kwargs) -> tuple[matplotlib.axes._axes.Axes, str]:
def plot_the_data(df: DataFrame, **kwargs) -> tuple[Axes, str]:
    """Produce the summary plot for the data.

    Args:
    - df: DataFrame containing the data to plot.
    - kwargs: "verbose", "middle" and "plot_type" are consumed here; the
      remainder is passed to constrain_data(). A "legend" entry is
      expected in the dictionary that constrain_data() returns.

    Returns:
    - ax: Axes object containing the plot.
    - plot_type: type of plot, either 'zscores' or 'zscaled'.
    """

    # pull out the arguments handled by this function
    show_summary = kwargs.pop("verbose", False)
    proportion = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)

    # limit the data, then standardise it
    subset, remaining = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(subset, proportion, verbose=show_summary)

    # anything other than ZSCORES is plotted using the scaled data
    if plot_type == ZSCORES:
        adjusted = z_scores
    else:
        adjusted = z_scaled

    ax = horizontal_bar_plot(subset, adjusted, proportion, plot_type, remaining)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, remaining["legend"])
    ax.set_xlim(remaining.get("xlim"))  # provide space for the labels

    return ax, plot_type

Plot the data as a summary plot. Args:

  • df: DataFrame containing the data to plot.
  • kwargs

Returns:

  • ax: Axes object containing the plot.
  • plot_type: type of plot, either 'zscores' or 'zscaled'.
def summary_plot( data: ~DataT, **kwargs: Unpack[SummaryKwargs]) -> matplotlib.axes._axes.Axes:
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot (see SummaryKwargs), including:
        - verbose: print a per-column summary table (default False).
        - middle: middle proportion of the data to highlight (default 0.8).
        - plot_type: 'zscores' (the default) or 'zscaled'.
        - plot_from: where the plotted data starts; also used in the
          default x-axis label (default 0).
        - legend: legend settings; by default the legend is placed
          below the x-axis label.
        - xlabel: x-axis label; None for no label, and an empty string
          (the default) for a generated label.

    Returns Axes.

    Raises TypeError if the checked data is not a DataFrame.
    """

    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- legend (only applied when the caller did not supply one)
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # --- and plot it ...
    ax, plot_type = plot_the_data(df, **kwargs)
    label_x_axis(
        kwargs.get("plot_from", 0),
        label=kwargs.get("xlabel", ""),
        plot_type=plot_type,
        ax=ax,
        df=df,
    )
    mark_reference_lines(plot_type, ax)

    return ax

Plot a summary of historical data for a given DataFrame.

Args:

  • data: DataFrame containing the data to summarise. The column names are used as labels for the plot.
  • kwargs: additional arguments for the plot (see SummaryKwargs).

Returns Axes.