Source code for causalexplain.explainability.regression_quality

from typing import List, Set, Union
import numpy as np
from scipy import stats
from sklearn.base import BaseEstimator



[docs]
class RegQuality(BaseEstimator):


[docs]
    def __init__(self):
        super().__init__()



[docs]
    @staticmethod
    def predict(
            scores: List[float],
            gamma_shape: float = 1,
            gamma_scale: float = 1,
            threshold: float = 0.9,
            verbose: bool = False) -> Set[int]:
        """
        Returns the indices of features that are both gamma and outliers. Both criteria
        are applied to the given scores to determine if the MSE error obtained from
        the regression is bad compared with the rest of regressions for the other features
        in the dataset, and thus the feature should be considered a parent node.

        Parameters
        ----------
        scores: List[float]
            List of scores

        Returns
        -------
        Set[int]
            List of indices of features that are both gamma and outliers
        """
        scores = np.array(scores)
        gamma_indices = RegQuality._gamma_criteria(
            scores, gamma_shape, gamma_scale, threshold, verbose=verbose)
        outliers_indices = RegQuality._mad_criteria(scores, verbose=verbose)

        return set(gamma_indices).intersection(outliers_indices)


    @staticmethod
    def _mad_criteria(scores, verbose=False) -> Set[int]:
        """
        Returns indices of outliers in the given scores, using the MAD method.
        Taken from https://stats.stackexchange.com/a/78617
        
        Parameters
        ----------
        scores: List[float]
            List of scores
        verbose: bool
            Whether to print the score and M for each score
            
        Returns
        -------
        Set[int]
            List of indices of outliers in the given scores
        """
        median = np.median(scores)
        ad = [np.abs(score-median) for score in scores]
        mad = np.median(ad)
        M = [np.abs(.6745 * (score-median) / mad) for score in scores]
        mad_indices = [idx for idx, m in enumerate(M) if m > 3.5]

        if verbose:
            print(f"Median: {median:.4f}")
            print(f"Median of absolute differences: {mad:.4f}")
            for score, m in zip(scores, M):
                print(f"Score: {score:.4f}, M: {m:.4f}")

        return set(mad_indices)

    @staticmethod
    def _gamma_criteria(
            scores,
            gamma_shape=1,
            gamma_scale=1,
            threshold=0.9,
            verbose=False) -> Union[None, Set[int]]:
        """
        Returns a list of booleans indicating whether the score is below the threshold

        Parameters
        ----------
        scores: List[float]
            List of scores
        gamma_shape: float
            Shape parameter for gamma distribution
        gamma_scale: float
            Scale parameter for gamma distribution
        threshold: float
            Threshold for gamma criteria
        verbose: bool
            Whether to print the score and gamma criteria for each score

        Returns
        -------
        Set[int]
            List of booleans indicating whether the score is below the threshold
        """
        gamma_indices = []
        for idx, score in enumerate(scores):
            pdf = stats.gamma.pdf(score, a=gamma_shape, scale=gamma_scale)
            if pdf <= threshold:
                gamma_indices.append(idx)
            if verbose:
                print(
                    f"Score: {score:.4f}, PDF: {pdf:.4f}, criteria: {pdf < threshold}")

        return set(gamma_indices)