Source code for dvhastats.ui

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# ui.py
"""DVHA-Stats classes for user interaction"""
#
# Copyright (c) 2020 Dan Cutright
# Copyright (c) 2020 Arka Roy
# This file is part of DVHA-Stats, released under a MIT license.
#    See the file LICENSE included with this distribution, also
#    available at https://github.com/cutright/DVHA-Stats


from os.path import dirname, join
import numpy as np
from dvhastats.utilities import import_data
from dvhastats import plot
from dvhastats import stats

# Directory holding this module, and the directory one level above it
SCRIPT_DIR = dirname(__file__)
PARENT_DIR = dirname(SCRIPT_DIR)

# Sample data set used by DVHAStats when it is created with data=None
TEST_DATA_PATH = join(
    join(PARENT_DIR, "tests", "testdata"), "multivariate_data.csv"
)


class DVHAStatsBaseClass:
    """Base Class for DVHAStats objects and child objects

    Keeps track of the plot objects created by an instance so that they
    can later be closed by their matplotlib figure number.
    """

    def __init__(self):
        """Initialization of DVHAStatsBaseClass for common attr/methods"""
        # Plot objects created by show(); each exposes .figure and .close()
        self.plots = []

    def close(self, figure_number):
        """Close a plot by figure_number

        Only the first plot whose figure number matches is closed and
        removed from ``plots``. Nothing happens if there is no match.

        Parameters
        ----------
        figure_number : int
            The matplotlib figure number of the plot to close
        """
        # Locate the match first so the list is not mutated mid-iteration
        match = next(
            (
                i
                for i, p in enumerate(self.plots)
                if p.figure.number == figure_number
            ),
            None,
        )
        if match is not None:
            self.plots.pop(match).close()


class DVHAStats(DVHAStatsBaseClass):
    """The main UI class object for DVHAStats

    Parameters
    ----------
    data : numpy.array, dict, str, None
        Input data (2-D) with N rows of observations and
        p columns of variables.  The CSV file must have a header row
        for column names. Test data is loaded if None
    var_names : list of str, optional
        If data is a numpy array, optionally provide the column names.
    x_axis : numpy.array, list, optional
        Specify x_axis for plotting purposes. Default is based on row
        number in data
    avg_len : int
        When plotting raw data, a trend line will be plotted using this
        value as an averaging length. If N < avg_len + 1 will not
        plot a trend line
    del_const_vars : bool
        Automatically delete any variables that have constant data. The
        names of these variables are stored in the deleted_vars attr.
        Default value is False.
    """

    def __init__(
        self,
        data=None,
        var_names=None,
        x_axis=None,
        avg_len=5,
        del_const_vars=False,
    ):
        """Class used to calculate various statistics"""
        DVHAStatsBaseClass.__init__(self)

        data = TEST_DATA_PATH if data is None else data
        self.data, self.var_names = import_data(data, var_names)

        self.x_axis = x_axis

        # Names of variables removed by del_var / del_const_vars
        self.deleted_vars = []

        # Storage for Box-Cox transformed columns, allocated on first use
        self.box_cox_data = None

        self.avg_len = avg_len

        # The bool parameter shadows the method name only locally;
        # self.del_const_vars is still the bound method
        if del_const_vars:
            self.del_const_vars()

    def get_data_by_var_name(self, var_name):
        """Get the single variable array based on var_name

        Parameters
        ----------
        var_name : int, str
            The name (str) or index (int) of the variable of interest

        Returns
        ----------
        np.ndarray
            The column of data for the given var_name
        """
        return self.data[:, self.get_index_by_var_name(var_name)]

    def get_index_by_var_name(self, var_name):
        """Get the variable index by var_name

        A name found in var_names takes precedence over an integer index.

        Parameters
        ----------
        var_name : int, str
            The name (str) or index (int) of the variable of interest.
            numpy integers are accepted as indices too.

        Returns
        ----------
        int
            The column index for the given var_name

        Raises
        ----------
        AttributeError
            If var_name is neither a known name nor a valid column index
        """
        if var_name in self.var_names:
            return self.var_names.index(var_name)

        if isinstance(var_name, (int, np.integer)) and var_name in range(
            self.variable_count
        ):
            return int(var_name)

        # str() each name so non-string names cannot turn this into a
        # TypeError while building the message
        msg = "%s is not a valid var_name\n%s" % (
            var_name,
            ",".join(str(v) for v in self.var_names),
        )
        raise AttributeError(msg)

    @property
    def observations(self):
        """Number of observations in data

        Returns
        ----------
        int
            Number of rows in data
        """
        return self.data.shape[0]

    @property
    def variable_count(self):
        """Number of variables in data

        Returns
        ----------
        int
            Number of columns in data
        """
        return self.data.shape[1]

    def _drop_box_cox_columns(self, indices):
        """Keep box_cox_data aligned with data when columns are deleted

        Must be called before the columns are removed from data.
        """
        if self.box_cox_data is None:
            return
        if self.box_cox_data.shape == self.data.shape:
            self.box_cox_data = np.delete(self.box_cox_data, indices, axis=1)
        else:
            # Already out of sync, discard rather than guess the mapping
            self.box_cox_data = None

    def del_var(self, var_name):
        """Permanently remove a variable from data

        The removed name is appended to deleted_vars.

        Parameters
        ----------
        var_name : int, str
            The var_name to delete (or index of variable)
        """
        index = self.get_index_by_var_name(var_name)
        self.deleted_vars.append(self.var_names[index])
        self._drop_box_cox_columns(index)
        self.data = np.delete(self.data, index, axis=1)
        self.var_names.pop(index)

    def del_const_vars(self):
        """Permanently remove variables with no variation

        The removed names are appended to deleted_vars.
        """
        # Evaluate constancy once; every later step relies on these indices
        del_indices = self.constant_var_indices
        if not del_indices:
            return

        self.deleted_vars.extend(self.var_names[i] for i in del_indices)
        self._drop_box_cox_columns(del_indices)
        self.data = np.delete(self.data, del_indices, axis=1)
        # Pop from the end so the remaining indices stay valid
        for i in reversed(del_indices):
            self.var_names.pop(i)

    def correlation_matrix(self, corr_type="Pearson"):
        """Get a Pearson-R or Spearman correlation matrix

        Parameters
        ----------
        corr_type : str
            Either "Pearson" or "Spearman"

        Returns
        ----------
        CorrelationMatrixUI
            A CorrelationMatrixUI class object
        """
        return CorrelationMatrixUI(
            self.data, self.var_names, corr_type=corr_type
        )

    def is_constant(self, var_name):
        """Determine if data by var_name is constant

        Parameters
        ----------
        var_name : int, str
            The var_name to check (or index of variable)

        Returns
        ----------
        bool
            True if all values of var_name are the same (i.e., no variation)
        """
        return stats.is_arr_constant(self.get_data_by_var_name(var_name))

    @property
    def constant_vars(self):
        """Get a list of all constant variables

        Returns
        ----------
        list
            Names of variables with no variation
        """
        return [v for v in self.var_names if self.is_constant(v)]

    @property
    def constant_var_indices(self):
        """Get a list of all constant variable indices

        Returns
        ----------
        list
            Indices of variables with no variation
        """
        return [i for i, v in enumerate(self.var_names) if self.is_constant(v)]

    @property
    def non_const_data(self):
        """Return self.data excluding any constant variables

        Returns
        ----------
        np.ndarray
            Data with constant variables removed. This does not alter the data
            property.
        """
        return np.delete(self.data, self.constant_var_indices, axis=1)

    def histogram(self, var_name, bins="auto", nan_policy="omit"):
        """Get a Histogram class object

        Parameters
        ----------
        var_name : str, int
            The name (str) or index (int) of the variable to plot
        bins : int, list, str, optional
            See https://numpy.org/doc/stable/reference/generated/numpy.histogram.html for details
        nan_policy : str
            Value must be one of the following: 'propagate', 'raise', 'omit'
            Defines how to handle when input contains nan. The following
            options are available (default is 'omit'):
            'propagate': returns nan
            'raise': throws an error
            'omit': performs the calculations ignoring nan values

        Returns
        ----------
        stats.Histogram
            A stats.Histogram class object
        """
        data = self.get_data_by_var_name(var_name)
        return stats.Histogram(data, bins, nan_policy)

    def linear_reg(
        self,
        y,
        y_var_name=None,
        reg_vars=None,
        saved_reg=None,
        back_elim=False,
        back_elim_p=0.05,
    ):
        """Initialize a MultiVariableRegression class object

        Parameters
        ----------
        y : np.ndarray, list, str, int
            Dependent data based on DVHAStats.data. If y is str or int, then
            it is assumed to be the var_name or index of data to be set as
            the dependent variable
        y_var_name : int, str, optional
            Optionally provide name of the dependent variable. Automatically
            set if y is str or int
        reg_vars : list, optional
            Optionally specify variable names or indices of data to be used
            in the regression
        saved_reg : MultiVariableRegression, optional
            If supplied, predicted values (y-hat) will be calculated with
            DVHAStats.data and the regression from saved_reg. This is useful
            if testing a regression model on new data.
        back_elim : bool
            Automatically perform backward elimination if True
        back_elim_p : float
            p-value threshold for backward elimination

        Returns
        ----------
        LinearRegUI
            A LinearRegUI class object.
        """

        input_data = self.__process_reg_input(y, reg_vars, y_var_name)

        return LinearRegUI(
            input_data["data"],
            input_data["y"],
            saved_reg,
            var_names=input_data["var_names"],
            y_var_name=input_data["y_var_name"],
            back_elim=back_elim,
            back_elim_p=back_elim_p,
        )

    def __process_reg_input(self, y, reg_vars, y_var_name):
        """Split data into independent/dependent parts for a regression

        Parameters
        ----------
        y : np.ndarray, list, str, int
            Dependent data, or the name/index of the column to use for it
        reg_vars : list, None
            Names or indices of the columns to keep as independent data.
            All columns are kept if None
        y_var_name : int, str, None
            Name of the dependent variable; replaced by the column name
            when y is given as a name or index

        Returns
        ----------
        dict
            Keys are "data", "y", "var_names" and "y_var_name"
        """
        excl = []
        if reg_vars is not None:
            incl = {self.get_index_by_var_name(v) for v in reg_vars}
            excl = [i for i in range(self.variable_count) if i not in incl]

        if isinstance(y, (str, int, np.integer)):
            y_index = self.get_index_by_var_name(y)
            y_var_name = self.var_names[y_index]
            # The dependent column must never be one of the regressors
            if y_index not in excl:
                excl.append(y_index)
                excl.sort()
            y = self.data[:, y_index]

        data = np.delete(self.data, excl, axis=1)
        dropped = set(excl)
        var_names = [
            v for i, v in enumerate(self.var_names) if i not in dropped
        ]
        return {
            "data": data,
            "y": y,
            "var_names": var_names,
            "y_var_name": y_var_name,
        }

    def univariate_control_chart(
        self,
        var_name,
        std=3,
        ucl_limit=None,
        lcl_limit=None,
        box_cox=False,
        box_cox_alpha=None,
        box_cox_lmbda=None,
        const_policy="propagate",
    ):
        """
        Calculate control limits for a standard univariate Control Chart

        Parameters
        ----------
        var_name : str, int
            The name (str) or index (int) of the variable to plot
        std : int, float, optional
            Number of standard deviations used to calculate if a y-value is
            out-of-control
        ucl_limit : float, optional
            Limit the upper control limit to this value
        lcl_limit : float, optional
            Limit the lower control limit to this value
        box_cox : bool, optional
            Set to true to perform a Box-Cox transformation on data prior to
            calculating the control chart statistics
        box_cox_alpha : float, optional
            If alpha is not None, return the 100 * (1-alpha)% confidence
            interval for lmbda as the third output argument. Must be between
            0.0 and 1.0.
        box_cox_lmbda : float, optional
            If lmbda is not None, do the transformation for that value.
            If lmbda is None, find the lambda that maximizes the log-likelihood
            function and return it as the second output argument.
        const_policy : str
            {'propagate', 'raise', 'omit'}
            Defines how to handle when data is constant. The following
            options are available (default is 'propagate'):
            'propagate': returns nan
            'raise': throws an error
            'omit': remove NaN data

        Returns
        ----------
        ControlChartUI
            A ControlChartUI class object
        """
        kwargs = {"std": std, "ucl_limit": ucl_limit, "lcl_limit": lcl_limit}

        index = self.get_index_by_var_name(var_name)
        var_name = self.var_names[index]

        if box_cox:
            # Always transform this column. box_cox_data is allocated
            # zero-filled by the first single-column transform, so reading
            # it whenever it exists would chart all-zero data for every
            # column not yet transformed, and would ignore the
            # alpha/lmbda supplied to this call.
            cc_data = self.box_cox_by_index(
                index,
                alpha=box_cox_alpha,
                lmbda=box_cox_lmbda,
                const_policy=const_policy,
            )
            plot_title = "Univariate Control Chart with Box-Cox Transformation"
        else:
            cc_data = self.data[:, index]
            plot_title = None

        if const_policy == "propagate" and stats.is_nan_arr(cc_data):
            plot_title = "Cannot calculate control chart with const data!"

        return ControlChartUI(
            cc_data, var_name=var_name, plot_title=plot_title, **kwargs
        )

    def univariate_control_charts(self, **kwargs):
        """
        Calculate Control charts for all variables

        Parameters
        ----------
        kwargs : any
            See univariate_control_chart for keyword parameters

        Returns
        ----------
        dict
            ControlChart class objects stored in a dictionary with
            var_names and indices as keys (can use var_name or index)
        """
        data = {}
        for i, key in enumerate(self.var_names):
            chart = self.univariate_control_chart(key, **kwargs)
            data[key] = chart
            data[i] = chart
        return data

    def hotelling_t2(
        self,
        alpha=0.05,
        box_cox=False,
        box_cox_alpha=None,
        box_cox_lmbda=None,
        const_policy="omit",
    ):
        """
        Calculate a multivariate (Hotelling T^2) Control Chart

        Parameters
        ----------
        alpha : float
            Significance level used to determine the upper control limit (ucl)
        box_cox : bool, optional
            Set to true to perform a Box-Cox transformation on data prior to
            calculating the control chart statistics
        box_cox_alpha : float, optional
            If alpha is not None, return the 100 * (1-alpha)% confidence
            interval for lmbda as the third output argument. Must be between
            0.0 and 1.0.
        box_cox_lmbda : float, optional
            If lmbda is not None, do the transformation for that value.
            If lmbda is None, find the lambda that maximizes the log-likelihood
            function and return it as the second output argument.
        const_policy : str
            {'raise', 'omit'}
            Defines how to handle when data is constant. The following
            options are available (default is 'omit'):
            'raise': throws an error
            'omit': exclude constant variables from calculation

        Returns
        ----------
        HotellingT2UI
            HotellingT2UI class object
        """

        if box_cox:
            # Transform every column on each call. An existing
            # box_cox_data may hold only some transformed columns (the
            # rest zero-filled) or columns computed with other parameters.
            self.box_cox(
                alpha=box_cox_alpha,
                lmbda=box_cox_lmbda,
                const_policy=const_policy,
            )
            data = self.box_cox_data
            if const_policy == "omit":
                data = stats.remove_const_column(data)
            plot_title = (
                "Multivariate Control Chart with Box-Cox Transformation"
            )
        else:
            data = self.non_const_data if const_policy == "omit" else self.data
            plot_title = None

        return HotellingT2UI(data, alpha, plot_title=plot_title)

    def box_cox_by_index(
        self, index, alpha=None, lmbda=None, const_policy="propagate"
    ):
        """Apply a Box-Cox transformation to one variable

        The result is stored in the matching column of box_cox_data,
        which is allocated (zero-filled) on first use.

        Parameters
        ----------
        index : int, str
            The index corresponding to the variable data to have a box-cox
            transformation applied.  If index is a string, it will be assumed
            to be the var_name
        lmbda : None, scalar, optional
            If lmbda is not None, do the transformation for that value.
            If lmbda is None, find the lambda that maximizes the
            log-likelihood function and return it as the second output
            argument.
        alpha : None, float, optional
            If alpha is not None, return the 100 * (1-alpha)% confidence
            interval for lmbda as the third output argument. Must be between
            0.0 and 1.0.
        const_policy : str
            {'propagate', 'raise', 'omit'}
            Defines how to handle when data is constant. The following
            options are available (default is 'propagate'):
            'propagate': returns nan
            'raise': throws an error
            'omit': remove

        Returns
        ----------
        np.ndarray
            Results from stats.box_cox
        """
        if isinstance(index, str):
            index = self.get_index_by_var_name(index)

        # Use a float buffer so transformed values are not truncated when
        # data has an integer dtype; rebuild it if data changed shape
        if (
            self.box_cox_data is None
            or self.box_cox_data.shape != self.data.shape
        ):
            self.box_cox_data = np.zeros_like(self.data, dtype=float)

        self.box_cox_data[:, index] = stats.box_cox(
            self.data[:, index],
            alpha=alpha,
            lmbda=lmbda,
            const_policy=const_policy,
        )

        return self.box_cox_data[:, index]

    def box_cox(self, alpha=None, lmbda=None, const_policy="propagate"):
        """Apply box_cox_by_index for all data

        See box_cox_by_index for a description of the parameters.
        """
        for i in range(self.variable_count):
            self.box_cox_by_index(
                i, alpha=alpha, lmbda=lmbda, const_policy=const_policy
            )

    def pca(self, n_components=0.95, transform=True, **kwargs):
        """Return an sklearn PCA-like object, see PCA object for details

        Parameters
        ----------
        n_components : int, float, None or str
            Number of components to keep. if n_components is not set all
            components are kept:
            n_components == min(n_samples, n_features)

            If n_components == 'mle' and svd_solver == 'full', Minka's MLE
            is used to guess the dimension. Use of n_components == 'mle'
            will interpret svd_solver == 'auto' as svd_solver == 'full'.

            If 0 < n_components < 1 and svd_solver == 'full', select the
            number of components such that the amount of variance that
            needs to be explained is greater than the percentage specified
            by n_components.

            If svd_solver == 'arpack', the number of components must be
            strictly less than the minimum of n_features and n_samples.
        transform : bool
            Fit the model and apply the dimensionality reduction
        kwargs : any
            Provide any keyword arguments for sklearn.decomposition.PCA:
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

        Returns
        ----------
        PCAUI
            A principal component analysis object inherited from
            sklearn.decomposition.PCA
        """
        return PCAUI(
            self.data,
            var_names=self.var_names,
            n_components=n_components,
            transform=transform,
            **kwargs
        )

    def risk_adjusted_control_chart(
        self,
        y,
        std=3,
        ucl_limit=None,
        lcl_limit=None,
        saved_reg=None,
        y_name=None,
        reg_vars=None,
        back_elim=False,
        back_elim_p=0.05,
    ):
        """
        Calculate control limits for a Risk-Adjusted Control Chart

        Parameters
        ----------
        y : list, np.ndarray, str, int
            1-D Input data (dependent data). If y is str or int, it is
            used as the var_name or index of the dependent variable
        std : int, float, optional
            Number of standard deviations used to calculate if a y-value is
            out-of-control.
        ucl_limit : float, optional
            Limit the upper control limit to this value
        lcl_limit : float, optional
            Limit the lower control limit to this value
        saved_reg : MultiVariableRegression, optional
            If supplied, predicted values (y-hat) will be calculated with
            DVHAStats.data and the regression from saved_reg. This is useful
            if testing a regression model on new data.
        y_name : int, str, optional
            Optionally provide name of the dependent variable. Automatically
            set if y is str or int
        reg_vars : list, optional
            Optionally specify variable names or indices of data to be used
            in the regression
        back_elim : bool
            Automatically perform backward elimination if True
        back_elim_p : float
            p-value threshold for backward elimination

        Returns
        ----------
        RiskAdjustedControlChartUI
            A RiskAdjustedControlChartUI class object
        """

        input_data = self.__process_reg_input(y, reg_vars, y_name)

        return RiskAdjustedControlChartUI(
            input_data["data"],
            input_data["y"],
            std=std,
            ucl_limit=ucl_limit,
            lcl_limit=lcl_limit,
            saved_reg=saved_reg,
            x=self.x_axis,
            y_name=input_data["y_var_name"],
            var_names=input_data["var_names"],
            back_elim=back_elim,
            back_elim_p=back_elim_p,
        )

    def __add_tend_line(self, var_name, plot_index):
        """Add trend line based on moving average

        No line is added when there are fewer than avg_len + 1
        observations, as described for the avg_len parameter.
        """
        if self.observations < self.avg_len + 1:
            return

        trend_x, trend_y = stats.moving_avg(
            self.get_data_by_var_name(var_name), self.avg_len
        )
        self.plots[plot_index].add_line(
            trend_y, x=trend_x, line_color="black", line_width=0.75
        )

    def show(self, var_name=None, plot_type="trend", **kwargs):
        """Display a plot of var_name with matplotlib

        Parameters
        ----------
        var_name : str, int, None
            The name (str) or index (int) of the variable to plot.  If None
            and plot_type="box", all variables will be plotted.
        plot_type : str
            Either "trend", "hist", "box" (case-insensitive)
        kwargs : any
            Forwarded to the plot object when plot_type is "hist" or
            "box" (e.g., matplotlib hist key word arguments). Ignored
            for "trend".

        Returns
        ----------
        int
            The number of the newly created matplotlib figure

        Raises
        ----------
        NotImplementedError
            If plot_type is not supported, or if var_name is None for a
            plot_type other than "box"
        """
        plot_type = plot_type.lower()

        if plot_type not in {"trend", "hist", "box"}:
            msg = "plot_type must be in 'trend', 'hist', or 'box'"
            raise NotImplementedError(msg)

        if var_name is None:
            if plot_type != "box":
                msg = "Must specify var_name if plot_type in ('trend', 'hist')"
                raise NotImplementedError(msg)
            index = None
        else:
            index = self.get_index_by_var_name(var_name)
            var_name = self.var_names[index]

        if plot_type == "trend":
            self.plots.append(
                plot.Plot(
                    self.data[:, index],
                    x=self.x_axis,
                    xlabel="Observation",
                    ylabel=var_name,
                    title="",
                )
            )
            # -1 targets the plot that was just appended
            self.__add_tend_line(var_name, -1)
        elif plot_type == "hist":
            self.plots.append(
                plot.Histogram(self.data[:, index], xlabel=var_name, **kwargs)
            )
        else:
            # plot_type == "box": one variable, or all of them
            data = self.data if var_name is None else self.data[:, index]
            xlabels = self.var_names if var_name is None else [var_name]
            self.plots.append(plot.BoxPlot(data, xlabels=xlabels, **kwargs))
        return self.plots[-1].figure.number


class ControlChartUI(DVHAStatsBaseClass, stats.ControlChart):
    """Univariate Control Chart

    Parameters
    ----------
    y : list, np.ndarray
        Input data (1-D)
    std : int, float, optional
        Number of standard deviations used to calculate if a y-value is
        out-of-control.
    ucl_limit : float, optional
        Limit the upper control limit to this value
    lcl_limit : float, optional
        Limit the lower control limit to this value
    var_name : str, optional
        Name of the charted variable, used as the y-axis label
    x : list, np.ndarray, optional
        x-axis values, forwarded to stats.ControlChart
    plot_title : str, optional
        Over-ride the plot title
    """

    def __init__(
        self,
        y,
        std=3,
        ucl_limit=None,
        lcl_limit=None,
        var_name=None,
        x=None,
        plot_title=None,
    ):
        """Calculate control limits for a standard univariate Control Chart"""
        # Both bases are initialized explicitly: the base class sets up the
        # plot list, stats.ControlChart does the calculation
        DVHAStatsBaseClass.__init__(self)
        stats.ControlChart.__init__(
            self, y, std=std, ucl_limit=ucl_limit, lcl_limit=lcl_limit, x=x
        )

        if plot_title is None:
            plot_title = "Univariate Control Chart"
        self.plot_title = plot_title

        self.var_name = var_name

    def show(self):
        """Display the univariate control chart with matplotlib

        Returns
        ----------
        int
            The number of the newly created matplotlib figure
        """
        chart = plot.ControlChart(
            title=self.plot_title, ylabel=self.var_name, **self.chart_data
        )
        self.plots.append(chart)
        return chart.figure.number


class RiskAdjustedControlChartUI(
    DVHAStatsBaseClass, stats.RiskAdjustedControlChart
):
    """Risk-Adjusted Control Chart using a Multi-Variable Regression

    Parameters
    ----------
    X : array-like
        Input array (independent data)
    y : list, np.ndarray
        1-D Input data (dependent data)
    std : int, float, optional
        Number of standard deviations used to calculate if a y-value is
        out-of-control.
    ucl_limit : float, optional
        Limit the upper control limit to this value
    lcl_limit : float, optional
        Limit the lower control limit to this value
    x : list, np.ndarray, optional
        x-axis values
    y_name : str, optional
        Name of the dependent variable, used as the y-axis label
    var_names : list, optional
        Optionally provide names of the variables
    saved_reg : MultiVariableRegression, optional
        Optionally provide a previously calculated regression
    plot_title : str, optional
        Over-ride the plot title
    back_elim : bool
        Automatically perform backward elimination if True
    back_elim_p : float
        p-value threshold for backward elimination
    """

    def __init__(
        self,
        X,
        y,
        std=3,
        ucl_limit=None,
        lcl_limit=None,
        x=None,
        y_name=None,
        var_names=None,
        saved_reg=None,
        plot_title=None,
        back_elim=False,
        back_elim_p=0.05,
    ):
        """Calculate control limits for a Risk-Adjusted Control Chart"""
        # Both bases are initialized explicitly: the base class sets up the
        # plot list, stats.RiskAdjustedControlChart does the calculation
        DVHAStatsBaseClass.__init__(self)

        stats.RiskAdjustedControlChart.__init__(
            self,
            X,
            y,
            std=std,
            ucl_limit=ucl_limit,
            lcl_limit=lcl_limit,
            x=x,
            saved_reg=saved_reg,
            var_names=var_names,
            back_elim=back_elim,
            back_elim_p=back_elim_p,
        )

        if plot_title is None:
            plot_title = "Risk-Adjusted Control Chart"
        self.plot_title = plot_title

        self.var_name = y_name

    def show(self):
        """Display the risk-adjusted control chart with matplotlib

        Returns
        ----------
        int
            The number of the newly created matplotlib figure
        """
        chart = plot.ControlChart(
            title=self.plot_title, ylabel=self.var_name, **self.chart_data
        )
        self.plots.append(chart)
        return chart.figure.number


class HotellingT2UI(DVHAStatsBaseClass, stats.HotellingT2):
    """Hotelling's t-squared statistic for multivariate hypothesis testing

    Parameters
    ----------
    data : np.ndarray
        A 2-D array of data to perform multivariate analysis.
        (e.g., DVHAStats.data)
    alpha : float
        The significance level used to calculate the
        upper control limit (UCL)
    plot_title : str, optional
        Over-ride the plot title
    """

    def __init__(self, data, alpha=0.05, plot_title=None):
        """Initialize the Hotelling T^2 class"""
        # Both bases are initialized explicitly: the base class sets up the
        # plot list, stats.HotellingT2 does the calculation
        DVHAStatsBaseClass.__init__(self)
        stats.HotellingT2.__init__(self, data, alpha=alpha)

        if plot_title is None:
            plot_title = "Multivariate Control Chart"
        self.plot_title = plot_title

    def show(self):
        """Display the multivariate control chart with matplotlib

        Returns
        ----------
        int
            The number of the newly created matplotlib figure
        """
        chart = plot.ControlChart(
            title=self.plot_title,
            ylabel="Hotelling T^2",
            **self.chart_data
        )
        self.plots.append(chart)
        return chart.figure.number


class PCAUI(DVHAStatsBaseClass, stats.PCA):
    """Principal component analysis UI object

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    var_names : list of str, optional
        Names of the independent variables in X. Column indices are used
        if not provided
    n_components : int, float, None or str
        Number of components to keep. if n_components is not set all
        components are kept:
        n_components == min(n_samples, n_features)
        If n_components == 'mle' and svd_solver == 'full', Minka's MLE
        is used to guess the dimension. Use of n_components == 'mle'
        will interpret svd_solver == 'auto' as svd_solver == 'full'.
        If 0 < n_components < 1 and svd_solver == 'full', select the
        number of components such that the amount of variance that
        needs to be explained is greater than the percentage specified
        by n_components.
        If svd_solver == 'arpack', the number of components must be
        strictly less than the minimum of n_features and n_samples.
    transform : bool
        Fit the model and apply the dimensionality reduction
    kwargs : any
        Provide any keyword arguments for sklearn.decomposition.PCA:
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    """

    def __init__(
        self, X, var_names=None, n_components=0.95, transform=True, **kwargs
    ):
        """Initialize PCA and perform fit. Inherits sklearn.decomposition.PCA"""
        DVHAStatsBaseClass.__init__(self)
        stats.PCA.__init__(
            self, X, n_components=n_components, transform=transform, **kwargs
        )
        self.var_names = range(X.shape[1]) if var_names is None else var_names

    def show(self, plot_type="feature_map", absolute=True):
        """Create a heat map of PCA components

        Parameters
        ----------
        plot_type : str
            Select a plot type to display. Options include: feature_map.
        absolute : bool
            Heat map will display the absolute values in PCA components
            if True

        Returns
        ----------
        int
            The number of the newly created matplotlib figure

        Raises
        ----------
        NotImplementedError
            If plot_type is not a supported option
        """
        # Fail loudly, as DVHAStats.show does, rather than returning None
        # without plotting anything
        if plot_type != "feature_map":
            msg = "plot_type must be 'feature_map'"
            raise NotImplementedError(msg)

        data = self.feature_map_data
        if absolute:
            data = abs(data)
        self.plots.append(plot.PCAFeatureMap(data, self.var_names))
        return self.plots[-1].figure.number


class CorrelationMatrixUI(DVHAStatsBaseClass, stats.CorrelationMatrix):
    """Pearson-R or Spearman correlation matrix UI object

    Parameters
    ----------
    X : np.ndarray
        Input data (2-D) with N rows of observations and
        p columns of variables.
    var_names : list, optional
        Optionally set the variable names with a list of str. Column
        indices are used if not provided
    corr_type : str
        Either "Pearson" or "Spearman"
    cmap : str
        matplotlib compatible color map
    """

    def __init__(
        self, X, var_names=None, corr_type="Pearson", cmap="coolwarm"
    ):
        """Initialization of CorrelationMatrix object"""
        DVHAStatsBaseClass.__init__(self)
        stats.CorrelationMatrix.__init__(self, X=X, corr_type=corr_type)
        self.var_names = range(X.shape[1]) if var_names is None else var_names
        self.cmap = cmap

    def show(self, absolute=False, corr=True):
        """Create a heat map of the correlation (or p-value) matrix

        Parameters
        ----------
        absolute : bool
            Heat map will display the absolute values of the matrix
            if True
        corr : bool
            Plot a p-value matrix if False, correlation matrix if True.

        Returns
        ----------
        int
            The number of the newly created matplotlib figure
        """

        data = self.corr if corr else self.p
        data = abs(data) if absolute else data

        # Compare case-insensitively so "Pearson" (the default passed to
        # the constructor) is not labelled as Spearman
        is_pearson = str(self.corr_type).lower() == "pearson"
        mat_type = "Pearson-R" if is_pearson else "Spearman"
        value_type = "Correlation" if corr else "p-value"
        title = "%s %s Matrix" % (mat_type, value_type)

        self.plots.append(
            plot.HeatMap(
                data,
                xlabels=self.var_names,
                ylabels=self.var_names,
                cmap=self.cmap,
                title=title,
            )
        )
        return self.plots[-1].figure.number


class LinearRegUI(DVHAStatsBaseClass, stats.MultiVariableRegression):
    """UI object wrapping MultiVariableRegression with plotting helpers

    Parameters
    ----------
    X : np.ndarray
        Independent data, forwarded to MultiVariableRegression
    y : np.ndarray, list
        Dependent data based on DVHAStats.data
    saved_reg : MultiVariableRegression, optional
        If supplied, predicted values (y-hat) will be calculated with
        DVHAStats.data and the regression from saved_reg. This is useful
        if testing a regression model on new data.
    var_names : list, optional
        Optionally provide names of the independent variables
    y_var_name : int, str, optional
        Optionally provide name of the dependent variable
    back_elim : bool
        Automatically perform backward elimination if True
    back_elim_p : float
        p-value threshold for backward elimination
    """

    def __init__(
        self,
        X,
        y,
        saved_reg=None,
        var_names=None,
        y_var_name=None,
        back_elim=False,
        back_elim_p=0.05,
    ):
        """Set up the plot registry, then initialize the regression"""
        DVHAStatsBaseClass.__init__(self)

        # All regression options are passed through by keyword
        reg_kwargs = {
            "X": X,
            "y": y,
            "saved_reg": saved_reg,
            "var_names": var_names,
            "y_var_name": y_var_name,
            "back_elim": back_elim,
            "back_elim_p": back_elim_p,
        }
        stats.MultiVariableRegression.__init__(self, **reg_kwargs)

    def show(self, plot_type="residual"):
        """Plot the regression residuals or a probability plot

        Parameters
        ----------
        plot_type : str
            Either "residual" or "prob". Any other value is ignored
            and no figure is created.

        Returns
        -------
        int, None
            The number of the newly created matplotlib figure, or None
            if plot_type is not supported
        """

        if plot_type not in {"residual", "prob"}:
            return

        # Title reflects the number of independent variables (columns)
        if self.X.shape[1] > 1:
            title = "Multi-Variable Linear Regression"
        else:
            title = "Linear Regression"

        # Both plot types overlay a dashed black reference line
        dashed = {"line_color": "black", "line_style": "--"}

        if plot_type == "residual":
            chart = self.chart_data
            residuals = chart["resid"]
            fitted = chart["y"]
            new_plot = plot.Plot(
                residuals,
                x=fitted,
                title=title,
                xlabel="Fitted Values",
                ylabel="Residual",
                line=False,
            )
            self.plots.append(new_plot)

            # Horizontal zero line spanning the range of fitted values
            x_span = [np.min(fitted), np.max(fitted)]
            new_plot.add_line([0, 0], x_span, **dashed)
        elif plot_type == "prob":
            prob = self.prob_plot
            new_plot = plot.Plot(
                prob["y"],
                x=prob["x"],
                title="Probability Plot",
                xlabel="Quantiles",
                ylabel="Ordered Values",
                line=False,
            )
            self.plots.append(new_plot)
            new_plot.add_line(prob["y_trend"], prob["x_trend"], **dashed)

        return self.plots[-1].figure.number