Source code for causalexplain.explainability.regression_quality

from typing import List, Set, Union
import numpy as np
from scipy import stats
from sklearn.base import BaseEstimator


class RegQuality(BaseEstimator):
    """
    Select features whose regression score is suspiciously bad.

    A feature is selected when its score satisfies two criteria at once:
    the gamma-density criterion (``_gamma_criteria``) and the MAD outlier
    criterion (``_mad_criteria``). All methods are static; the class keeps
    no state.
    """

    # Constants of the modified z-score used by the MAD criterion:
    # M_i = 0.6745 * (x_i - median) / MAD, flagged when |M_i| > 3.5.
    _MAD_CONSISTENCY = 0.6745
    _MAD_CUTOFF = 3.5

    def __init__(self):
        super().__init__()

    @staticmethod
    def predict(
            scores: List[float],
            gamma_shape: float = 1,
            gamma_scale: float = 1,
            threshold: float = 0.9,
            verbose: bool = False) -> Set[int]:
        """
        Returns the indices of features that meet both the gamma criterion
        and the MAD outlier criterion.

        Both criteria are applied to the given scores to determine if the
        MSE error obtained from the regression is bad compared with the rest
        of regressions for the other features in the dataset, and thus the
        feature should be considered a parent node.

        Parameters
        ----------
        scores: List[float]
            List of scores, one per feature
        gamma_shape: float
            Shape parameter of the gamma distribution
        gamma_scale: float
            Scale parameter of the gamma distribution
        threshold: float
            Upper bound on the gamma density for a score to be selected
        verbose: bool
            Whether to print the intermediate values of both criteria

        Returns
        -------
        Set[int]
            Indices of the scores selected by both criteria
        """
        scores = np.asarray(scores, dtype=float)
        gamma_indices = RegQuality._gamma_criteria(
            scores, gamma_shape, gamma_scale, threshold, verbose=verbose)
        outliers_indices = RegQuality._mad_criteria(scores, verbose=verbose)

        return gamma_indices & outliers_indices

    @staticmethod
    def _mad_criteria(scores, verbose=False) -> Set[int]:
        """
        Returns indices of outliers in the given scores, using the MAD
        (median absolute deviation) method.

        Taken from https://stats.stackexchange.com/a/78617

        Parameters
        ----------
        scores: List[float]
            List of scores
        verbose: bool
            Whether to print the score and M for each score

        Returns
        -------
        Set[int]
            Indices of the scores whose modified z-score M exceeds 3.5.
            Empty input yields an empty set.
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            # Nothing to rank; avoids the nan median of an empty array.
            return set()

        median = np.median(scores)
        deviations = scores - median
        mad = np.median(np.abs(deviations))

        if mad == 0:
            # At least half of the scores equal the median, so the usual
            # ratio would be x/0. Keep the limiting behaviour explicitly:
            # any score that differs from the median is infinitely far away
            # in MAD units, while scores at the median have M = 0.
            modified_z = np.where(deviations != 0, np.inf, 0.0)
        else:
            modified_z = np.abs(
                RegQuality._MAD_CONSISTENCY * deviations / mad)

        outlier_mask = modified_z > RegQuality._MAD_CUTOFF

        if verbose:
            print(f"Median: {median:.4f}")
            print(f"Median of absolute differences: {mad:.4f}")
            for score, m in zip(scores, modified_z):
                print(f"Score: {score:.4f}, M: {m:.4f}")

        # tolist() converts NumPy integers to plain Python ints.
        return set(np.flatnonzero(outlier_mask).tolist())

    @staticmethod
    def _gamma_criteria(
            scores,
            gamma_shape=1,
            gamma_scale=1,
            threshold=0.9,
            verbose=False) -> Set[int]:
        """
        Returns the indices of the scores whose gamma probability density
        is less than or equal to the threshold.

        Parameters
        ----------
        scores: List[float]
            List of scores
        gamma_shape: float
            Shape parameter for gamma distribution
        gamma_scale: float
            Scale parameter for gamma distribution
        threshold: float
            Threshold for gamma criteria
        verbose: bool
            Whether to print the score and gamma criteria for each score

        Returns
        -------
        Set[int]
            Indices of the scores for which ``pdf <= threshold``
        """
        scores = np.asarray(scores, dtype=float)
        pdf = stats.gamma.pdf(scores, a=gamma_shape, scale=gamma_scale)
        selected = pdf <= threshold

        if verbose:
            # Print the same comparison that decides membership, so the log
            # never contradicts the returned indices.
            for score, density, meets in zip(scores, pdf, selected):
                print(
                    f"Score: {score:.4f}, PDF: {density:.4f}, "
                    f"criteria: {meets}")

        # tolist() converts NumPy integers to plain Python ints.
        return set(np.flatnonzero(selected).tolist())