Source code for causalexplain.independence.regressors

from typing import List

import numpy as np
import pandas as pd
from pandas import DataFrame
from pygam import LinearGAM
from sklearn.gaussian_process import GaussianProcessRegressor as gpr

from causalexplain.independence.hsic import HSIC


def _fit_GPR(x, y, x_test, y_test):
    """
    Fit a Gaussian-process regressor on (x, y) and return test residuals.

    Args:
        x: training inputs passed to the regressor's ``fit``.
        y: training targets passed to the regressor's ``fit``.
        x_test: inputs on which predictions are made.
        y_test: targets the predictions are compared against.

    Returns:
        ``y_test`` minus the predictions, where the predictions have been
        reshaped to a single column.
    """
    # Targets are deliberately left un-normalised (normalize_y=False).
    regressor = gpr(normalize_y=False)
    regressor.fit(x, y)

    # Force a column vector so the subtraction lines up with a column y_test.
    y_hat = regressor.predict(x_test).reshape(-1, 1)
    return y_test - y_hat


def _fit_GAM(x, y, x_test, y_test):
    """
    Fit a LinearGAM on (x, y) via grid search and return test residuals.

    Args:
        x: training inputs passed to ``LinearGAM.gridsearch``.
        y: training targets passed to ``LinearGAM.gridsearch``.
        x_test: inputs on which predictions are made.
        y_test: targets the predictions are compared against.

    Returns:
        ``y_test`` minus the predictions, where the predictions have been
        reshaped to a single column (same convention as ``_fit_GPR``).
    """
    model = LinearGAM()
    model.gridsearch(x, y, progress=False)

    # Column vector so the subtraction lines up with a column-shaped y_test.
    prediction = np.asarray(model.predict(x_test)).reshape(-1, 1)

    # Residuals are measured against the targets. The previous version
    # subtracted the predictions from x_test and never used y_test.
    residuals = y_test - prediction
    return residuals


def fit_and_get_residuals(
    X: np.ndarray,
    Y: np.ndarray,
    X_test: np.ndarray = None,
    Y_test: np.ndarray = None,
    method="gpr",
):
    """
    Fit a model Y ~ f(X) and return the residuals on the test set.

    X is the independent variable and Y the dependent one. Gaussian noise
    (mean 0, standard deviation 0.1) is added to ``Y`` before fitting, so the
    result varies between calls unless NumPy's global random state is seeded.

    Args:
        X: (np.ndarray) The feature used as input to fit the model. A 1-D
            array is reshaped into a single column.
        Y: (np.ndarray) The feature to be predicted. A 1-D array is reshaped
            into a single column.
        X_test: (np.ndarray) The feature used as input to predict Y_test.
            A 1-D array is reshaped into a single column.
        Y_test: (np.ndarray) The feature the predictions are compared with.
            A 1-D array is reshaped into a single column.
            If either ``X_test`` or ``Y_test`` is None, both are replaced by
            the training data (``X`` and the noise-perturbed ``Y``).
        method: (str) Either "gpr" or "gam"

    Returns:
        The residuals produced by the selected fitting helper. Only the
        residuals are returned; no error metric is computed here.

    Raises:
        ValueError: if ``method`` is neither "gpr" nor "gam".
    """
    # Reject an unknown method before doing any reshaping or random draws.
    if method not in ("gpr", "gam"):
        raise ValueError(f"Invalid method: {method}")

    # Fix dimensions if it hasn't been done already.
    if np.ndim(X) == 1:
        X = X.reshape(-1, 1)
    if np.ndim(Y) == 1:
        Y = Y.reshape(-1, 1)
    if X_test is not None and np.ndim(X_test) == 1:
        X_test = X_test.reshape(-1, 1)
    if Y_test is not None and np.ndim(Y_test) == 1:
        Y_test = Y_test.reshape(-1, 1)

    # Perturb the training targets with one noise draw per sample.
    noise = np.random.normal(0, 0.1, X.shape[0]).reshape(-1, 1)
    Y = Y + noise

    # Without a complete test set, evaluate on the (noisy) training data.
    if X_test is None or Y_test is None:
        X_test = X
        Y_test = Y

    if method == "gpr":
        return _fit_GPR(X, Y, X_test, Y_test)
    return _fit_GAM(X, Y, X_test, Y_test)
def run_feature_selection(X: DataFrame, y: str) -> List:
    """
    Score the columns of "X" other than 'y' against 'y' using HSIC.

    The column named 'y' is separated from the remaining columns, both parts
    are transposed, and an HSIC object is fitted on them.

    Args:
        X: Dataframe with ALL continuous variables
        y: the name of the variable in X to be used as target.

    Return:
        The ``stat`` attribute of the fitted HSIC object. The annotation
        declares it as a List (presumably one score per remaining
        variable; confirm against the HSIC implementation).

    Raises:
        ValueError: if 'y' is not one of the column names of "X".
    """
    predictor_names = list(X.columns.values)
    predictor_names.remove(y)  # ValueError when y is not a column

    target_frame = pd.DataFrame(X[y], columns=[y])
    predictor_frame = X[predictor_names]

    # HSIC is handed the transposed tables: one row per variable.
    target_matrix = target_frame.values.T
    predictor_matrix = predictor_frame.values.T

    return HSIC().fit(predictor_matrix, target_matrix).stat