mgplot.summary_plot
summary_plot.py:
Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.
"""
summary_plot.py:

Produce a summary plot for the data in a given DataFrame.
The data is normalised to z-scores and scaled.
"""

# --- imports
# system imports
from typing import Any, NotRequired, Unpack

# analytic third-party imports
from numpy import ndarray, array
from matplotlib.pyplot import Axes
from pandas import DataFrame, Period

# local imports
from mgplot.settings import DataT
from mgplot.utilities import get_axes, label_period
from mgplot.finalise_plot import make_legend
from mgplot.utilities import constrain_data, check_clean_timeseries
from mgplot.keyword_checking import (
    report_kwargs,
    validate_kwargs,
    BaseKwargs,
)


# --- constants
ME = "summary_plot"
ZSCORES = "zscores"
ZSCALED = "zscaled"


class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function."""

    # Axes to draw on (presumably forwarded to get_axes() -- confirm)
    ax: NotRequired[Axes | None]
    # print the per-column summary table built in calculate_z()
    verbose: NotRequired[bool]
    # central proportion of the data to shade; plot_the_data() defaults to 0.8
    middle: NotRequired[float]
    # "zscores" (the default) or "zscaled"
    plot_type: NotRequired[str]
    # int position or Period naming the start of the data in the x-axis label
    plot_from: NotRequired[int | Period]
    # legend settings handed to make_legend()
    legend: NotRequired[dict[str, Any]]
    # x-axis label: "" generates a default label, None leaves the axis unlabelled
    xlabel: NotRequired[str | None]


# --- functions
def _calc_quantiles(middle: float) -> ndarray:
    """Return the two quantiles that bound the central `middle` proportion.

    For example, middle=0.8 gives (approximately) the 0.1 and 0.9 quantiles.
    """
    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])


def calculate_z(
    original: DataFrame,  # only contains the data points of interest
    middle: float,  # middle proportion of data to highlight (eg. 0.8)
    verbose: bool = False,  # print the summary data
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores and scaled z-scores for each column.

    Args:
    - original: the data points of interest, one column per series.
    - middle: middle proportion of the data to highlight (eg. 0.8).
      It is only used for the verbose printout.
    - verbose: if True, print a per-column summary table.

    Returns a tuple (z_scores, z_scaled), where z_scaled is the z-scores
    rescaled column by column so that they span -1 to +1. The middle
    quantiles are NOT returned; callers that need them recompute them.
    """

    # calculate z-scores and scaled scores
    z_scores: DataFrame = (original - original.mean()) / original.std()
    z_scaled: DataFrame = (
        # scale z-scores between -1 and +1
        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5) * 2
    )

    if verbose:
        # the quantiles are only needed for the printed summary
        q_middle = _calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": original.mean(),
                "median": original.median(),
                "min shaded": original.quantile(q=q_middle[0]),
                "max shaded": original.quantile(q=q_middle[1]),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            }
        )
        print(frame)

    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting


def _plot_middle_bars(
    adjusted: DataFrame,
    middle: float,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Plot the middle (typically 80%) of the data as a horizontal bar.

    Note: this also stores the x-axis limits in kwargs["xlim"], which is
    why kwargs must be passed as a dictionary (the caller sees the update).

    Return the matplotlib Axes object obtained from get_axes(**kwargs).
    """

    q = _calc_quantiles(middle)
    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data
    span = 1.15  # the x-axis always covers at least -span to +span
    space = 0.2  # extra room beyond the most extreme plotted value
    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -span) - space
    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), span) + space
    kwargs["xlim"] = (low, high)  # update the kwargs with the xlim
    ax, _ = get_axes(**kwargs)
    ax.barh(
        y=lo_hi.index,
        width=lo_hi[q[1]] - lo_hi[q[0]],
        left=lo_hi[q[0]],
        color="#bbbbbb",
        label=f"Middle {middle * 100:0.0f}% of prints",
    )
    return ax


def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int | str,
) -> None:
    """Add the latest datapoints to the summary plot.

    A dot is drawn at each column's latest adjusted value, and the matching
    value from `original` (one decimal place) is written over it.

    NOTE(review): f_size is overwritten with 10 below, so the value passed
    by the caller is currently ignored -- confirm whether that is intended.
    """

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    f_size = 10
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,
        )


def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Label the extremes in the scaled plots.

    The x-axis limits are always taken from kwargs["xlim"]. For the scaled
    plot type only, each column's median is marked and its minimum and
    maximum from the original data are written at the left and right edges.
    """

    original, adjusted = data
    low, high = kwargs["xlim"]
    ax.set_xlim(low, high)  # set the x-axis limits
    if plot_type == ZSCALED:
        ax.scatter(
            adjusted.median(),
            adjusted.columns,
            color="darkorchid",
            marker="x",
            s=5,
            label="Median",
        )
        for col_num, col_name in enumerate(original.columns):
            ax.text(
                low,
                col_num,
                f" {original[col_name].min():.2f}",
                ha="left",
                va="center",
                size=f_size,
            )
            ax.text(
                high,
                col_num,
                f"{original[col_name].max():.2f} ",
                ha="right",
                va="center",
                size=f_size,
            )


def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Plot horizontal bars for the middle of the data.

    The latest datapoints and (for scaled plots) the extremes are then
    added on top of the bars. Returns the Axes that was drawn on.
    """

    # kwargs is a dictionary, not a splat
    # so that we can pass it to the Axes object and
    # set the x-axis limits.

    ax = _plot_middle_bars(adjusted, middle, kwargs)
    f_size = "x-small"
    plot_latest_datapoint(ax, original, adjusted, f_size)
    label_extremes(ax, data=(original, adjusted), plot_type=plot_type, f_size=f_size, kwargs=kwargs)

    return ax


def label_x_axis(plot_from: int | Period, label: str | None, plot_type: str, ax: Axes, df: DataFrame) -> None:
    """Label the x-axis for the plot.

    - label is None: the x-axis is left unlabelled.
    - label is an empty string: a default label is generated that names the
      plot type and the starting period.
    - otherwise: label is used as given.
    """

    start: Period = plot_from if isinstance(plot_from, Period) else df.index[plot_from]
    if label is not None:
        if not label:
            if plot_type == ZSCORES:
                label = f"Z-scores for prints since {label_period(start)}"
            else:
                label = f"-1 to 1 scaled z-scores since {label_period(start)}"
        ax.set_xlabel(label)


def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Mark the reference lines for the plot.

    Scaled plots get dashed lines at -1 and +1; z-score plots get a dashed
    line at 0. Any other plot type gets no reference line.
    """

    if plot_type == ZSCALED:
        ax.axvline(-1, color="#555555", linewidth=0.5, linestyle="--", label="-1")
        ax.axvline(1, color="#555555", linewidth=0.5, linestyle="--", label="+1")
    elif plot_type == ZSCORES:
        ax.axvline(0, color="#555555", linewidth=0.5, linestyle="--", label="0")


def plot_the_data(df: DataFrame, **kwargs) -> tuple[Axes, str]:
    """Plot the data as a summary plot.

    Args:
    - df: DataFrame containing the data to plot.
    - kwargs: "verbose", "middle" and "plot_type" are consumed here; the
      rest is passed to constrain_data(). A "legend" entry is required
      in the keyword arguments returned by constrain_data().

    Returns:
    - ax: Axes object containing the plot.
    - plot_type: type of plot, either 'zscores' or 'zscaled'.
    """

    # get the data, calculate z-scores and scaled scores based on the start period
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)
    subset, kwargsd = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_types argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargsd)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargsd["legend"])
    ax.set_xlim(kwargsd.get("xlim"))  # provide space for the labels

    return ax, plot_type


# --- public
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot (see SummaryKwargs),
      including:
        - verbose: if True, print the per-column summary table.
        - middle: central proportion of the data to shade (default 0.8).
        - plot_type: "zscores" (default) or "zscaled".
        - plot_from: int position or Period used to name the start of the
          data in the default x-axis label (default 0).
        - legend: dict of legend settings; by default a small legend is
          placed below the x-axis label.
        - xlabel: x-axis label; an empty string (the default) generates
          a label, None leaves the axis unlabelled.

    Returns Axes.

    Raises TypeError if the checked data is not a pandas DataFrame.
    """

    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- legend
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # --- and plot it ...
    ax, plot_type = plot_the_data(df, **kwargs)
    label_x_axis(
        kwargs.get("plot_from", 0), label=kwargs.get("xlabel", ""), plot_type=plot_type, ax=ax, df=df
    )
    mark_reference_lines(plot_type, ax)

    return ax
ME =
'summary_plot'
ZSCORES =
'zscores'
ZSCALED =
'zscaled'
class
SummaryKwargs(mgplot.keyword_checking.BaseKwargs):
class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function."""

    # Axes to draw on (presumably forwarded to get_axes() -- confirm)
    ax: NotRequired[Axes | None]
    # print the per-column summary table built in calculate_z()
    verbose: NotRequired[bool]
    # central proportion of the data to shade; plot_the_data() defaults to 0.8
    middle: NotRequired[float]
    # "zscores" (the default) or "zscaled"
    plot_type: NotRequired[str]
    # int position or Period naming the start of the data in the x-axis label
    plot_from: NotRequired[int | Period]
    # legend settings handed to make_legend()
    legend: NotRequired[dict[str, Any]]
    # x-axis label: "" generates a default label, None leaves the axis unlabelled
    xlabel: NotRequired[str | None]
Keyword arguments for the summary_plot function.
def
calculate_z( original: pandas.core.frame.DataFrame, middle: float, verbose: bool = False) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def calculate_z(
    original: DataFrame,  # only contains the data points of interest
    middle: float,  # middle proportion of data to highlight (eg. 0.8)
    verbose: bool = False,  # print the summary data
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores and scaled z-scores for each column.

    Args:
    - original: the data points of interest, one column per series.
    - middle: middle proportion of the data to highlight (eg. 0.8).
      It is only used for the verbose printout.
    - verbose: if True, print a per-column summary table.

    Returns a tuple (z_scores, z_scaled), where z_scaled is the z-scores
    rescaled column by column so that they span -1 to +1. The middle
    quantiles are NOT returned; callers that need them recompute them.
    """

    # calculate z-scores and scaled scores
    z_scores: DataFrame = (original - original.mean()) / original.std()
    z_scaled: DataFrame = (
        # scale z-scores between -1 and +1
        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5) * 2
    )

    if verbose:
        # the quantiles are only needed for the printed summary
        q_middle = _calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": original.mean(),
                "median": original.median(),
                "min shaded": original.quantile(q=q_middle[0]),
                "max shaded": original.quantile(q=q_middle[1]),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            }
        )
        print(frame)

    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting
Calculate z-scores and scaled z-scores for each column. Return z_scores, z_scaled. The quantiles for the start/end of the middle proportion of data to highlight are only used in the verbose printout; they are not returned.
def
plot_latest_datapoint( ax: matplotlib.axes._axes.Axes, original: pandas.core.frame.DataFrame, adjusted: pandas.core.frame.DataFrame, f_size: int | str) -> None:
def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int | str,
) -> None:
    """Mark the most recent observation of every column on the summary plot.

    A dot is drawn at each column's latest adjusted value, and the matching
    value from `original` (one decimal place) is written over it.

    Args:
    - ax: the Axes to draw on.
    - original: supplies the numbers shown as text.
    - adjusted: supplies the x positions.
    - f_size: requested font size.
      NOTE(review): this value is replaced by 10 before it is used, so the
      caller's choice currently has no effect -- confirm whether intended.
    """

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")

    f_size = 10  # hard-coded: replaces whatever the caller passed in
    latest = adjusted.index[-1]
    text_style = {"ha": "center", "va": "center", "size": f_size}
    for position, column in enumerate(original.columns):
        x_position = adjusted.at[latest, column]
        shown_value = original.at[latest, column]
        ax.text(x=x_position, y=position, s=f"{shown_value:.1f}", **text_style)
Add the latest datapoints to the summary plot
def
label_extremes( ax: matplotlib.axes._axes.Axes, data: tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame], plot_type: str, f_size: int | str, kwargs: dict[str, typing.Any]) -> None:
def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Label the extremes in the scaled plots.

    Args:
    - ax: the Axes to draw on.
    - data: a tuple of (original, adjusted) DataFrames.
    - plot_type: the extremes are only labelled when this equals ZSCALED.
    - f_size: font size for the minimum/maximum text.
    - kwargs: must contain "xlim" as a (low, high) pair.

    The x-axis limits are always applied, whatever the plot type.
    """

    original, adjusted = data
    low, high = kwargs["xlim"]
    ax.set_xlim(low, high)  # set the x-axis limits
    if plot_type == ZSCALED:
        # mark the median of each adjusted column with a small cross
        ax.scatter(
            adjusted.median(),
            adjusted.columns,
            color="darkorchid",
            marker="x",
            s=5,
            label="Median",
        )
        for col_num, col_name in enumerate(original.columns):
            # minimum of the original data, written at the left x-limit
            ax.text(
                low,
                col_num,
                f" {original[col_name].min():.2f}",
                ha="left",
                va="center",
                size=f_size,
            )
            # maximum of the original data, written at the right x-limit
            ax.text(
                high,
                col_num,
                f"{original[col_name].max():.2f} ",
                ha="right",
                va="center",
                size=f_size,
            )
Label the extremes in the scaled plots.
def
horizontal_bar_plot( original: pandas.core.frame.DataFrame, adjusted: pandas.core.frame.DataFrame, middle: float, plot_type: str, kwargs: dict[str, typing.Any]) -> matplotlib.axes._axes.Axes:
def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the summary chart in three layers and return its Axes.

    The layers are: the bars for the middle of the data, the latest
    datapoints, and the labels for the extremes.

    kwargs has to be a real dictionary (not a splat): the bar-drawing step
    records the x-axis limits in it and the labelling step reads them back.
    """

    font_size = "x-small"

    axes = _plot_middle_bars(adjusted, middle, kwargs)
    plot_latest_datapoint(axes, original, adjusted, font_size)
    label_extremes(axes, (original, adjusted), plot_type, font_size, kwargs)
    return axes
Plot horizontal bars for the middle of the data.
def
label_x_axis( plot_from: int | pandas._libs.tslibs.period.Period, label: str | None, plot_type: str, ax: matplotlib.axes._axes.Axes, df: pandas.core.frame.DataFrame) -> None:
def label_x_axis(plot_from: int | Period, label: str | None, plot_type: str, ax: Axes, df: DataFrame) -> None:
    """Set the x-axis label of the summary plot.

    - label is None: the x-axis is left unlabelled.
    - label is an empty string: a default label is generated that names the
      plot type and the starting period.
    - otherwise: label is used as given.

    plot_from is either the starting Period itself or an integer position
    into df.index from which the starting Period is read.
    """

    # resolve the starting period first, even if no label ends up being set
    if isinstance(plot_from, Period):
        start: Period = plot_from
    else:
        start = df.index[plot_from]

    if label is None:
        return

    if not label:
        label = (
            f"Z-scores for prints since {label_period(start)}"
            if plot_type == ZSCORES
            else f"-1 to 1 scaled z-scores since {label_period(start)}"
        )
    ax.set_xlabel(label)
Label the x-axis for the plot.
def
mark_reference_lines(plot_type: str, ax: matplotlib.axes._axes.Axes) -> None:
def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Draw the thin dashed vertical guide lines that suit the plot type.

    Scaled plots get lines at -1 and +1; z-score plots get a line at 0.
    Any other plot type gets no guide line.
    """

    # (x position, legend label) pairs for each known plot type
    guides = {
        ZSCALED: ((-1, "-1"), (1, "+1")),
        ZSCORES: ((0, "0"),),
    }
    for x_position, legend_label in guides.get(plot_type, ()):
        ax.axvline(x_position, color="#555555", linewidth=0.5, linestyle="--", label=legend_label)
Mark the reference lines for the plot.
def
plot_the_data( df: pandas.core.frame.DataFrame, **kwargs) -> tuple[matplotlib.axes._axes.Axes, str]:
def plot_the_data(df: DataFrame, **kwargs) -> tuple[Axes, str]:
    """Draw the summary plot for df and report which plot type was used.

    Args:
    - df: DataFrame containing the data to plot.
    - kwargs: "verbose", "middle" and "plot_type" are consumed here; the
      rest is passed to constrain_data(). A "legend" entry is required
      in the keyword arguments returned by constrain_data().

    Returns:
    - ax: Axes object containing the plot.
    - plot_type: type of plot, either 'zscores' or 'zscaled'.
    """

    # options handled here are removed so that they are not forwarded
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)

    # constrain the data, then standardise it
    selected, plot_kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(selected, middle, verbose=verbose)

    # anything other than the z-score plot type uses the scaled values
    if plot_type == ZSCORES:
        adjusted = z_scores
    else:
        adjusted = z_scaled

    axes = horizontal_bar_plot(selected, adjusted, middle, plot_type, plot_kwargs)
    axes.tick_params(axis="y", labelsize="small")
    make_legend(axes, plot_kwargs["legend"])
    axes.set_xlim(plot_kwargs.get("xlim"))  # provide space for the labels
    return axes, plot_type
Plot the data as a summary plot. Args:
- df: DataFrame containing the data to plot.
- kwargs
Returns:
- ax: Axes object containing the plot.
- plot_type: type of plot, either 'zscores' or 'zscaled'.
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot (see SummaryKwargs),
      including:
        - verbose: if True, print the per-column summary table.
        - middle: central proportion of the data to shade (default 0.8).
        - plot_type: "zscores" (default) or "zscaled".
        - plot_from: int position or Period used to name the start of the
          data in the default x-axis label (default 0).
        - legend: dict of legend settings; by default a small legend is
          placed below the x-axis label.
        - xlabel: x-axis label; an empty string (the default) generates
          a label, None leaves the axis unlabelled.

    Returns Axes.

    Raises TypeError if the checked data is not a pandas DataFrame.
    """

    # --- check the kwargs (ME is the module-level name of this function)
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- legend
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # --- and plot it ...
    ax, plot_type = plot_the_data(df, **kwargs)
    label_x_axis(
        kwargs.get("plot_from", 0), label=kwargs.get("xlabel", ""), plot_type=plot_type, ax=ax, df=df
    )
    mark_reference_lines(plot_type, ax)

    return ax
Plot a summary of historical data for a given DataFrame.
Args:
- data: DataFrame containing the data to summarise. The column names are used as labels for the plot.
- kwargs: additional arguments for the plot, including: ax, verbose, middle, plot_type, plot_from, legend and xlabel (see SummaryKwargs).
Returns Axes.