Source code for causalexplain.estimators.rex.knowledge

import math

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from typing import Optional

# from causalexplain.common.utils import graph_from_dot_file, load_experiment
from ...common import utils

# pylint: disable=E1101:no-member, W0201:attribute-defined-outside-init, W0511:fixme
# pylint: disable=C0103:invalid_name, C0116:missing-function-docstring
# pylint: disable=R0913:too-many-arguments, R1702:too-many-branches
# pylint: disable=R0914:too-many-locals, R0915:too-many-statements



[docs]
class Knowledge:
    """
    This class collects everything we know about each edge in the proposed graph
    in terms of the following properties:

    - origin: the origin node
    - target: the target node
    - is_edge: whether the edge is in the reference graph
    - is_root_cause: whether the origin is a root cause
    - is_leaf_node: whether the origin is a leaf node
    - correlation: the correlation between the individual SHAP values and the origin node
    - KS_pval: the p-value of the Kolmogorov-Smirnov test between the origin and the target
    - shap_edge: whether the edge is in the graph constructed after evaluating mean
        SHAP values.
    - shap_skedastic_pval: the p-value of the skedastic test for the SHAP values
    - parent_skedastic_pval: the p-value of the skedastic test for the parent values
    - mean_shap: the mean of the SHAP values between the origin and the target
    - slope_shap: the slope of the linear regression for target vs. SHAP values
    - slope_target: the slope of the linear regression for the target vs. origin values
    - potential_root: whether the origin is a potential root cause
    - regression_err: the regression error of the origin to the target
    - err_contrib: the error contribution of the origin to the target
    - con_ind_pval: the p-value of the conditional independence test between the origin
        and the target
    """


[docs]
    def __init__(self, rex: object, ref_graph: nx.DiGraph):
        """
        Arguments:
        ----------
            shaps (ShapEstimator): The shap estimator.
            ref_graph (nx.DiGraph): The reference graph, or ground truth.
        """
        assert rex is not None, "Rex is None"
        assert rex.hierarchies is not None, "Hierarchies is None"
        assert rex.shaps is not None, "ShapEstimator is None"
        assert rex.pi is not None, "PIEstimator is None"

        self.K = 180.0 / math.pi
        self.shaps = rex.shaps
        self.pi = rex.pi
        self.hierarchies = rex.hierarchies
        self.indep = rex.indep
        self.feature_names = rex.feature_names
        self.scoring = rex.models.scoring
        self.ref_graph = ref_graph
        self.G_shap = rex.G_shap
        self.root_causes = rex.root_causes

        self.correlation_th = rex.correlation_th
        if self.correlation_th is not None:
            self.correlated_features = self.hierarchies.correlated_features



[docs]
    def info(self):
        """Returns a dataframe with the knowledge about each edge in the graph"""
        rows = []
        ci = self.indep.compute_cond_indep_pvals()
        for target in self.feature_names:
            for parent in self.feature_names:
                if target == parent:
                    continue

                if self.correlation_th is not None:
                    if parent in self.correlated_features[target]:
                        continue

                if self.correlation_th is not None:
                    all_features = [f for f in self.feature_names if (
                        f != target) and (f not in self.correlated_features[target])]
                else:
                    all_features = [
                        f for f in self.feature_names if f != target]
                feature_pos = all_features.index(parent)

                sd = self.shaps.shap_discrepancies[target][parent]
                pi = self.pi.pi[target]['importances_mean'][feature_pos]

                b0_s, beta1_s = sd.shap_model.params[0], sd.shap_model.params[1]
                b0_y, beta1_y = sd.parent_model.params[0], sd.parent_model.params[1]
                shap_slope = math.atan(beta1_s)*self.K
                parent_slope = math.atan(beta1_y)*self.K
                rows.append({
                    'origin': parent,
                    'target': target,
                    'is_edge': int((parent, target) in self.ref_graph.edges()),
                    'o_is_root': int(nx.ancestors(self.ref_graph, parent) == set()),
                    't_is_leaf': int(nx.descendants(self.ref_graph, target) == set()),
                    'correlation': self.hierarchies.correlations[target][parent],
                    'shap_corr': sd.shap_correlation,
                    'shap_gof': sd.shap_gof,
                    'ks_pval': sd.ks_pvalue,
                    'shap_edge': int(parent in set(self.G_shap.predecessors(target))),
                    'shap_sk_pval': sd.shap_p_value,
                    'parent_sk_pval': sd.parent_p_value,
                    'mean_shap': self.shaps.shap_mean_values[target][feature_pos],
                    'mean_pi': pi,
                    'slope_shap': shap_slope,
                    'slope_target': parent_slope,
                    'pot_root': int(parent in self.root_causes),
                    'regr_err': self.scoring[feature_pos],
                    'err_contrib': self.shaps.error_contribution.loc[target, parent],
                    'cond_ind_pval': ci[(target, parent)]
                })
        self.results = pd.DataFrame.from_dict(rows)
        return self.results



[docs]
    def retrieve(self, origin: str, target: str, what: Optional[str] = None):
        """Returns the knowledge about a specific edge"""
        if what is None:
            return self.results[(self.results.origin == origin) &
                                (self.results.target == target)]

        return self.results[(self.results.origin == origin) &
                            (self.results.target == target)][what].values[0]




if __name__ == "__main__":

    # Display Options
    np.set_printoptions(precision=4, linewidth=100)
    pd.set_option('display.precision', 4)
    pd.set_option('display.float_format', '{:.4f}'.format)

    # Paths
    path = "/Users/renero/phd/data/RC3/"
    output_path = "/Users/renero/phd/output/RC3/"
    # experiment_name = 'rex_generated_linear_1'
    experiment_name = 'custom_rex'

    # Read the data
    reference_graph = utils.graph_from_dot_file(f"{path}{experiment_name}.dot")
    data = pd.read_csv(f"{path}{experiment_name}.csv")
    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

    custom = utils.load_experiment(f"{experiment_name}", output_path)
    custom.is_fitted_ = True
    print(f"Loaded experiment {experiment_name}")

    custom.feature_names = list(data.columns)
    custom.models.score(data)
    custom.knowledge(reference_graph)