From 1bb3c53e71ef811048e1e03ce3311803f3b4baa0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 14 May 2020 18:56:38 +0200 Subject: [PATCH 1/9] catboost-plot-pimp-shapimp --- boruta/boruta_py.py | 358 +++++++++++++++++++++++++++++-------- boruta/test/modif_tests.py | 154 ++++++++++++++++ 2 files changed, 435 insertions(+), 77 deletions(-) create mode 100644 boruta/test/modif_tests.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index fd3f20b..c346708 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,18 @@ +from __future__ import print_function, division +import numpy as np +import scipy as sp +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance +import shap +import pandas as pd +import time +import matplotlib +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- """ @@ -8,17 +23,11 @@ License: BSD 3 clause """ -from __future__ import print_function, division -import numpy as np -import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. - The improvements of this implementation include: - Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods. @@ -48,40 +57,33 @@ class BorutaPy(BaseEstimator, TransformerMixin): - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. Confirmed ones are 1, Tentatives are 2, and the rejected are ranked - starting from 3, based on their feautre importance history through + starting from 3, based on their feature importance history through the iterations. - + - Using either the native variable importance, scikit permutation importance, + SHAP importance. We highly recommend using pruned trees with a depth between 3-7. - For more, see the docs of these functions, and the examples below. - Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - Boruta is an all relevant feature selection method, while most other are minimal optimal; this means it tries to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some classifier has a minimal error. - Why bother with all relevant feature selection? When you try to understand the phenomenon that made your data, you should care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). - Parameters ---------- - estimator : object A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute. Important features must correspond to high absolute values in the feature_importances_. - n_estimators : int or string, default = 1000 If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined automatically based on the size of the dataset. The other parameters of the used estimators need to be set with initialisation. 
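As an aside, the shadow-feature comparison described above can be sketched in a few lines. This is only an illustration of the idea, not the BorutaPy internals; the estimator is assumed to expose feature_importances_ and X is assumed to be a NumPy array:

    import numpy as np

    def one_boruta_round(estimator, X, y, perc=100, seed=0):
        """One illustrative Boruta iteration: real features vs. shuffled 'shadow' copies."""
        rng = np.random.default_rng(seed)
        X_shadow = X.copy()
        for j in range(X_shadow.shape[1]):
            rng.shuffle(X_shadow[:, j])              # destroy any real signal, keep the marginal distribution
        estimator.fit(np.hstack([X, X_shadow]), y)
        imp = estimator.feature_importances_
        imp_real, imp_shadow = imp[:X.shape[1]], imp[X.shape[1]:]
        threshold = np.percentile(imp_shadow, perc)  # perc=100 reproduces the original "max shadow" rule
        return imp_real > threshold                  # per-feature "hit" for this iteration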
- perc : int, default = 100 Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between shadow and real features. The max @@ -89,93 +91,81 @@ class BorutaPy(BaseEstimator, TransformerMixin): lower perc is the more false positives will be picked as relevant but also the less relevant features will be left out. The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max. - alpha : float, default = 0.05 Level at which the corrected p-values will get rejected in both correction steps. - + importance : str, default = 'shap' + The kind of variable importance used to compare and discriminate original + vs shadow predictors. two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. - max_iter : int, default = 100 The number of maximum iterations to perform. - random_state : int, RandomState instance or None; default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - + weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already - Attributes ---------- - n_features_ : int The number of selected features. - support_ : array of shape [n_features] - The mask of selected features - only confirmed ones are True. - support_weak_ : array of shape [n_features] - The mask of selected tentative features, which haven't gained enough support during the max_iter number of iterations.. - ranking_ : array of shape [n_features] - The feature ranking, such that ``ranking_[i]`` corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. - Examples -------- - + import pandas as pd from sklearn.ensemble import RandomForestClassifier from boruta import BorutaPy - + # load X and y # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute X = pd.read_csv('examples/test_X.csv', index_col=0).values y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values y = y.ravel() - + # define random forest classifier, with utilising all cores and # sampling in proportion to y labels rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5) - + # define Boruta feature selection method feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1) - + # find all relevant features - 5 features should be selected feat_selector.fit(X, y) - + # check selected features - first 5 features are selected feat_selector.support_ - + # check ranking of features feat_selector.ranking_ - + # call transform() on X to filter it down to selected features X_filtered = feat_selector.transform(X) - References ---------- - [1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package" Journal of Statistical Software, Vol. 
36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, - two_step=True, max_iter=100, random_state=None, verbose=0): + def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators self.perc = perc @@ -183,39 +173,48 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose + self.weight = weight + self.importance = importance + self.cat_name = None + self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = False + # plotting + self.imp_real_hist = None + self.sha_max = None + self.col_names = None def fit(self, X, y): """ Fits the Boruta feature selection with the provided estimator. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. """ - + self.imp_real_hist = np.empty((0, X.shape[1]), float) + if isinstance(X, pd.DataFrame) is not True: + X = pd.DataFrame(X) + self.col_names = X.columns.to_list() return self._fit(X, y) def transform(self, X, weak=False, return_df=False): """ Reduces the input X to the features selected by Boruta. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - + return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -228,22 +227,17 @@ def transform(self, X, weak=False, return_df=False): def fit_transform(self, X, y, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -254,6 +248,63 @@ def fit_transform(self, X, y, weak=False, return_df=False): self._fit(X, y) return self._transform(X, weak, return_df) + def plot_importance(self): + """ + Boxplot of the variable importance, ordered by magnitude + The max shadow variable importance illustrated by the dashed line. + Requires to apply the fit method first. 
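A usage sketch of the options added in this patch (illustrative only; it assumes a LightGBM classifier and a feature DataFrame X with target y, as in the example above):

    from lightgbm import LGBMClassifier
    from boruta import BorutaPy

    model = LGBMClassifier(max_depth=5, n_jobs=-1)
    feat_selector = BorutaPy(model, n_estimators=100, importance='shap',  # 'shap', 'pimp' or the native importance
                             max_iter=50, random_state=42, verbose=1)
    feat_selector.fit(X, y)           # X may be a pandas DataFrame; object columns are encoded internally
    feat_selector.plot_importance()   # boxplots of per-iteration importances, dashed line marks the max shadow importance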
+ :return: boxplot + """ + if self.imp_real_hist is None: + raise ValueError("Use the fit method first to compute the var.imp") + + color = {'boxes': 'gray', 'whiskers': 'gray', 'medians': '#404040', 'caps': 'gray'} + vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.col_names) + vimp_df = vimp_df.reindex(vimp_df.mean().sort_values(ascending=True).index, axis=1) + bp = vimp_df.boxplot(color=color, + boxprops=dict(linestyle='-', linewidth=1.5), + flierprops=dict(linestyle='-', linewidth=1.5), + medianprops=dict(linestyle='-', linewidth=1.5), + whiskerprops=dict(linestyle='-', linewidth=1.5), + capprops=dict(linestyle='-', linewidth=1.5), + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, 3 * len(self.support_) / 10) + ) + + box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + for c in range(len(box_face_col)): + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + + # xt = bp.get_xticks() + # xt = np.round(np.append(xt, self.sha_max), 2) + # xtl = xt.tolist() + # xtl[-1] = "sha. max" + # bp.set_xticks(xt) + # bp.set_xticklabels(xtl) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), + Line2D([0], [0], color="#FDD023", lw=5), + Line2D([0], [0], color="gray", lw=5), + Line2D([0], [0], linestyle='--', color="gray", lw=2)] + bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") + plt.axvline(x=self.sha_max, linestyle='--', color='gray') + fig = bp.get_figure() + fig.suptitle('Boruta importance and selected predictors') + plt.show() + + def _is_catboost(self): + self.is_cat = 'catboost' in str(type(self.estimator)) + + def _is_tree_based(self): + """ + checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) + :return: + """ + tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] + condition = any(i in str(type(self.estimator)) for i in tree_based_models) + return condition + def _validate_pandas_input(self, arg): try: return arg.values @@ -262,16 +313,48 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." 
) - def _fit(self, X, y): + def _fit(self, X_raw, y): + self._is_catboost() + + start_time = time.time() + # basic cat features encoding + # First, let's store "object" columns as categorical columns + # obj_feat = X_raw.dtypes.loc[X_raw.dtypes == 'object'].index.tolist() + obj_feat = list(set(list(X_raw.columns)) - set(list(X_raw.select_dtypes(include=[np.number])))) + + if obj_feat: + X_raw.loc[:, obj_feat] = X_raw[obj_feat].astype('str').astype('category') + cat_feat = X_raw.dtypes.loc[X_raw.dtypes == 'category'].index.tolist() + cat_idx = [X_raw.columns.get_loc(c) for c in cat_feat if c in cat_feat] + if cat_feat: + # a way without loop but need to re-do astype + Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() + + if self.is_cat is False: + if cat_feat: + X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) + else: + X = X_raw + else: + X = X_raw + + X = np.nan_to_num(X) + y = np.nan_to_num(y) + # w = w.fillna(0) + + self.cat_name = cat_feat + self.cat_idx = cat_idx + # check input params self._check_params(X, y) if not isinstance(X, np.ndarray): - X = self._validate_pandas_input(X) + X = self._validate_pandas_input(X) + if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state = check_random_state(self.random_state) + self.random_state_instance = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -301,7 +384,9 @@ def _fit(self, X, y): self.estimator.set_params(n_estimators=n_tree) # make sure we start with a new tree in each iteration - self.estimator.set_params(random_state=self.random_state) + # Catboost doesn't allow to change random seed after fitting + if self.is_cat is False: + self.estimator.set_params(random_state=self.random_state) # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -342,6 +427,11 @@ def _fit(self, X, y): self.support_[confirmed] = 1 self.support_weak_ = np.zeros(n_feat, dtype=np.bool) self.support_weak_[tentative] = 1 + # for plotting + self.imp_real_hist = imp_history + self.sha_max = imp_sha_max + if isinstance(X_raw, pd.DataFrame): + self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] # ranking, confirmed variables are rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) @@ -356,18 +446,18 @@ def _fit(self, X, y): # update rank for not_selected features if not_selected.shape[0] > 0: - # calculate ranks in each iteration, then median of ranks across feats - iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) - rank_medians = np.nanmedian(iter_ranks, axis=0) - ranks = self._nanrankdata(rank_medians, axis=0) - - # set smallest rank to 3 if there are tentative feats - if tentative.shape[0] > 0: - ranks = ranks - np.min(ranks) + 3 - else: - # and 2 otherwise - ranks = ranks - np.min(ranks) + 2 - self.ranking_[not_selected] = ranks + # calculate ranks in each iteration, then median of ranks across feats + iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) + rank_medians = np.nanmedian(iter_ranks, axis=0) + ranks = self._nanrankdata(rank_medians, axis=0) + + # set smallest rank to 3 if there are tentative feats + if tentative.shape[0] > 0: + ranks = ranks - np.min(ranks) + 3 + else: + # and 2 otherwise + ranks = ranks - np.min(ranks) + 2 + self.ranking_[not_selected] = ranks else: # all are selected, thus we set feature supports to True self.support_ = np.ones(n_feat, dtype=np.bool) @@ -375,6 +465,10 @@ def _fit(self, 
X, y): # notify user if self.verbose > 0: self._print_results(dec_reg, _iter, 1) + self.running_time = time.time() - start_time + hours, rem = divmod(self.running_time, 3600) + minutes, seconds = divmod(rem, 60) + print("All relevant predictors selected in {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) return self def _transform(self, X, weak=False, return_df=False): @@ -408,7 +502,16 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - self.estimator.fit(X, y) + if self.is_cat: + X = pd.DataFrame(X) + obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() + if obj_feat: + X[obj_feat] = X[obj_feat].astype('str').astype('category') + cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() + self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + else: + self.estimator.fit(X, y, sample_weight=self.weight) + except Exception as e: raise ValueError('Please check your X and y variable. The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) @@ -419,8 +522,100 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp + def _get_shap_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + # if obj_feat: + # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. 
The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification + if is_classifier(self.estimator): + shap_values = np.array(shap_values) + shap_values = shap_values.reshape(-1, shap_values.shape[-1]) + + shap_imp = np.mean(np.abs(shap_values), axis=0) + else: + raise ValueError('Not a tree based model') + + return shap_imp + + def _get_perm_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + + perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + else: + raise ValueError('Not a tree based model') + + return imp + def _get_shuffle(self, seq): - self.random_state.shuffle(seq) + self.random_state_instance.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): @@ -436,12 +631,19 @@ def _add_shadows_get_imps(self, X, y, dec_reg): # shuffle xSha x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha) # get importance of the merged matrix - imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + if self.importance == 'shap': + imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y) + elif self.importance == 'pimp': + imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y) + else: + imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + # separate importances of real and shadow features imp_sha = imp[x_cur_w:] imp_real = np.zeros(X.shape[1]) imp_real[:] = np.nan imp_real[x_cur_ind] = imp[:x_cur_w] + return imp_real, imp_sha def _assign_hits(self, hit_reg, cur_imp, imp_sha_max): @@ -492,14 +694,12 @@ def _fdrcorrection(self, pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. - Parameters ---------- pvals : array_like set of p-values of the individual tests. alpha : float error rate - Returns ------- rejected : array, bool @@ -541,7 +741,7 @@ def _check_params(self, X, y): Check hyperparameters as well as X and y before proceeding with fit. 
""" # check X and y are consistent len, X is Array and y is column - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, dtype=None, force_all_finite=False) if self.perc <= 0 or self.perc > 100: raise ValueError('The percentile should be between 0 and 100.') @@ -568,5 +768,9 @@ def _print_results(self, dec_reg, _iter, flag): n_tentative = np.sum(self.support_weak_) content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected]) result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)]) - output = "\n\nBorutaPy finished running.\n\n" + result + if self.importance in ['shap', 'pimp']: + vimp = str(self.importance) + else: + vimp = 'native' + output = "\n\nBorutaPy finished running using " + vimp + " var. imp.\n\n" + result print(output) diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/modif_tests.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. 
+ + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. 
predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file From bbea1b745c51eac0031e2b7b854c406480df52d0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Sat, 16 May 2020 12:54:56 +0200 Subject: [PATCH 2/9] correcting typos --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index c346708..1cb1416 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -542,15 +542,15 @@ def _get_shap_imp(self, X, y): obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) obj_idx = None - # if obj_feat: - # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() if self._is_tree_based(): try: if self.is_cat: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -607,7 +607,7 @@ def _get_perm_imp(self, X, y): raise ValueError('Please check your X and y variable. 
The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) - perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) imp = perm_imp.importances_mean.ravel() else: raise ValueError('Not a tree based model') From e8062d7cd58c1f2e13fa1c989a1afe7770782f06 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Mon, 18 May 2020 17:54:23 +0200 Subject: [PATCH 3/9] fix the fig size + tight layout --- boruta/boruta_py.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 1cb1416..7d943b7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -267,8 +267,7 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, - figsize=(10, 3 * len(self.support_) / 10) + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True ) box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) @@ -276,13 +275,8 @@ def plot_importance(self): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - # xt = bp.get_xticks() - # xt = np.round(np.append(xt, self.sha_max), 2) - # xtl = xt.tolist() - # xtl[-1] = "sha. max" - # bp.set_xticks(xt) - # bp.set_xticklabels(xtl) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), Line2D([0], [0], color="#FDD023", lw=5), Line2D([0], [0], color="gray", lw=5), @@ -290,7 +284,9 @@ def plot_importance(self): bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. 
max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() - fig.suptitle('Boruta importance and selected predictors') + plt.title('Boruta importance and selected predictors') + fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) + plt.tight_layout() plt.show() def _is_catboost(self): From 6a05d378c6c2a91eb0ea643c255ad685afe4fd87 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 28 May 2020 16:16:55 +0200 Subject: [PATCH 4/9] fixing lgb clf shap imp --- boruta/boruta_py.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 7d943b7..3136d47 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -292,6 +292,9 @@ def plot_importance(self): def _is_catboost(self): self.is_cat = 'catboost' in str(type(self.estimator)) + def _is_lightgbm(self): + self.is_lgb = 'lightgbm' in str(type(self.estimator)) + def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -556,12 +559,20 @@ def _get_shap_imp(self, X, y): # build the explainer explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification + # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): - shap_values = np.array(shap_values) - shap_values = shap_values.reshape(-1, shap_values.shape[-1]) - - shap_imp = np.mean(np.abs(shap_values), axis=0) + if isinstance(shap_values, list): + # for lightgbm clf sklearn, shap return list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) else: raise ValueError('Not a tree based model') From 2081d391cdf883742d3ddf439dccd2ec3cc6a63f Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:35:28 +0200 Subject: [PATCH 5/9] changes for lightgbm seed --- boruta/boruta_py.py | 42 +- .../Madalon_Data_Set-checkpoint.ipynb | 405 ++++++++++++++++++ boruta/examples/Madalon_Data_Set.ipynb | 21 +- .../.ipynb_checkpoints/__init__-checkpoint.py | 0 .../modif_tests-checkpoint.py | 154 +++++++ .../unit_tests-checkpoint.py | 57 +++ 6 files changed, 645 insertions(+), 34 deletions(-) create mode 100644 boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb create mode 100644 boruta/test/.ipynb_checkpoints/__init__-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 3136d47..eaa79ba 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,17 +1,15 @@ from __future__ import print_function, division import numpy as np import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance import shap 
import pandas as pd import time import matplotlib import matplotlib.pyplot as plt +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -180,7 +178,9 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = False + self.is_cat = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self.is_lgb = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None @@ -289,12 +289,6 @@ def plot_importance(self): plt.tight_layout() plt.show() - def _is_catboost(self): - self.is_cat = 'catboost' in str(type(self.estimator)) - - def _is_lightgbm(self): - self.is_lgb = 'lightgbm' in str(type(self.estimator)) - def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -313,7 +307,6 @@ def _validate_pandas_input(self, arg): ) def _fit(self, X_raw, y): - self._is_catboost() start_time = time.time() # basic cat features encoding @@ -329,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if self.is_cat is False: + if not self.is_cat: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -353,7 +346,7 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state_instance = check_random_state(self.random_state) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -384,8 +377,13 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if self.is_cat is False: - self.estimator.set_params(random_state=self.random_state) + if not self.is_cat: + if self._is_lightgbm: + # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) + else: + self.estimator.set_params(random_state=self.random_state) + # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -522,7 +520,7 @@ def _get_imp(self, X, y): return imp def _get_shap_imp(self, X, y): - + # SHAP and permutation importances must be computed on unseen data if self.weight is not None: w = self.weight if is_regressor(self.estimator): @@ -562,12 +560,12 @@ def _get_shap_imp(self, X, y): # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): if isinstance(shap_values, list): - # for lightgbm clf sklearn, shap return list of arrays + # for lightgbm clf sklearn api, shap returns list of arrays # https://github.com/slundberg/shap/issues/526 class_inds = range(len(shap_values)) shap_imp = np.zeros(shap_values[0].shape[1]) for i, ind in enumerate(class_inds): - shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp += np.abs(shap_values[ind]).mean(0) shap_imp /= len(shap_values) else: shap_imp = np.abs(shap_values).mean(0) @@ 
-622,7 +620,7 @@ def _get_perm_imp(self, X, y): return imp def _get_shuffle(self, seq): - self.random_state_instance.shuffle(seq) + self.random_state.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): diff --git a/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb new file mode 100644 index 0000000..45fb14a --- /dev/null +++ b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Boruta on the Madalon Data Set\n", + "Author: [Mike Bernico](mike.bernico@gmail.com)\n", + "\n", + "This example demonstrates using Boruta to find all relevant features in the Madalon dataset, which is an artificial dataset used in NIPS2003 and cited in the [Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)\n", + "\n", + "This dataset has 2000 observations and 500 features. We will use Boruta to identify the features that are relevant to the classification task.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Installation\n", + "#!pip install boruta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from boruta import BorutaPy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "def load_data():\n", + " # URLS for dataset via UCI\n", + " train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'\n", + " train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'\n", + "\n", + " X_data = pd.read_csv(train_data_url, sep=\" \", header=None)\n", + " y_data = pd.read_csv(train_label_url, sep=\" \", header=None)\n", + " data = X_data.loc[:, :499]\n", + " data['target'] = y_data[0]\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data = load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...491492493494495496497498499target
0485477537479452471491476475473...481477485511485481479475496-1
1483458460487587475526479485469...478487338513486483492510517-1
2487542499468448471442478480477...481492650506501480489499498-1
3480491510485495472417474502476...4804745724544694754824944611
4484502528489466481402478487468...4794524354865084815044955111
\n", + "

5 rows × 501 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 491 492 493 \\\n", + "0 485 477 537 479 452 471 491 476 475 473 ... 481 477 485 \n", + "1 483 458 460 487 587 475 526 479 485 469 ... 478 487 338 \n", + "2 487 542 499 468 448 471 442 478 480 477 ... 481 492 650 \n", + "3 480 491 510 485 495 472 417 474 502 476 ... 480 474 572 \n", + "4 484 502 528 489 466 481 402 478 487 468 ... 479 452 435 \n", + "\n", + " 494 495 496 497 498 499 target \n", + "0 511 485 481 479 475 496 -1 \n", + "1 513 486 483 492 510 517 -1 \n", + "2 506 501 480 489 499 498 -1 \n", + "3 454 469 475 482 494 461 1 \n", + "4 486 508 481 504 495 511 1 \n", + "\n", + "[5 rows x 501 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y = data.pop('target')\n", + "X = data.copy().values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta conforms to the sklearn api and can be used in a Pipeline as well as on it's own. Here we will demonstrate stand alone operation.\n", + "\n", + "First we will instantiate an estimator that Boruta will use. Then we will instantiate a Boruta Object." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", + "# Define Boruta feature selection method\n", + "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once built, we can use this object to identify the relevant features in our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta has confirmed only a few features as useful. When our run ended, Boruta was undecided on 2 features. '\n", + "\n", + "We can interrogate .support_ to understand which features were selected. .support_ returns an array of booleans that we can use to slice our feature matrix to include only relevant columns. Of course, .transform can also be used, as expected in the scikit API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check selected features\n", + "print(feat_selector.support_)\n", + "# Select the chosen features from our dataframe.\n", + "selected = X[:, feat_selector.support_]\n", + "print (\"\")\n", + "print (\"Selected Feature Matrix Shape\")\n", + "print (selected.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also interrogate the ranking of the unselected features with .ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/boruta/examples/Madalon_Data_Set.ipynb b/boruta/examples/Madalon_Data_Set.ipynb index 435eba0..45fb14a 100644 --- a/boruta/examples/Madalon_Data_Set.ipynb +++ b/boruta/examples/Madalon_Data_Set.ipynb @@ -18,9 +18,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Installation\n", @@ -43,7 +41,10 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -279,9 +280,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", @@ -348,9 +347,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -371,7 +368,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" }, "varInspector": { "cols": { @@ -404,5 +401,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py b/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from 
sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. + + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = 
preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file diff --git a/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py new file mode 100644 index 0000000..5d5ce9f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py @@ -0,0 +1,57 @@ +import unittest +from boruta import BorutaPy +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +import numpy as np + + +class BorutaTestCases(unittest.TestCase): + + def test_get_tree_num(self): + rfc = RandomForestClassifier(max_depth=10) + bt = BorutaPy(rfc) + self.assertEqual(bt._get_tree_num(10), 44, "Tree Est. Math Fail") + self.assertEqual(bt._get_tree_num(100), 141, "Tree Est. 
Math Fail") + + def test_if_boruta_extracts_relevant_features(self): + np.random.seed(42) + y = np.random.binomial(1, 0.5, 1000) + X = np.zeros((1000, 10)) + + z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000) + z[z == -1] = 0 + z[z == 2] = 1 + + # 5 relevant features + X[:, 0] = z + X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000) + X[:, 2] = y + np.random.normal(0, 1, 1000) + X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000) + X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000) + + # 5 irrelevant features + X[:, 5] = np.random.normal(0, 1, 1000) + X[:, 6] = np.random.poisson(1, 1000) + X[:, 7] = np.random.binomial(1, 0.3, 1000) + X[:, 8] = np.random.normal(0, 1, 1000) + X[:, 9] = np.random.poisson(1, 1000) + + rfc = RandomForestClassifier() + bt = BorutaPy(rfc) + bt.fit(X, y) + + # make sure that only all the relevant features are returned + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # test if this works as expected for dataframe input + X_df, y_df = pd.DataFrame(X), pd.Series(y) + bt.fit(X_df, y_df) + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # check it dataframe is returned when return_df=True + self.assertIsInstance(bt.transform(X_df, return_df=True), pd.DataFrame) + +if __name__ == '__main__': + unittest.main() + + From a6c4b8ad85166aee10f07e04b6facdac2277257c Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:43:51 +0200 Subject: [PATCH 6/9] naming of var consistent with master --- boruta/boruta_py.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index eaa79ba..49b2ce7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Author: Daniel Homola + +Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ + +License: BSD 3 clause +""" + from __future__ import print_function, division import numpy as np import scipy as sp @@ -11,16 +21,6 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - class BorutaPy(BaseEstimator, TransformerMixin): @@ -171,20 +171,20 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state - self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = 'catboost' in str(type(self.estimator)) + self._is_catboost = 'catboost' in str(type(self.estimator)) # Random state throws an error with lightgbm - self.is_lgb = 'lightgbm' in str(type(self.estimator)) + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None + self.__version__ = '0.3' def fit(self, X, y): """ @@ -322,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self.is_cat: + if not self._is_catboost: if cat_feat: X = 
pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -377,7 +377,7 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self.is_cat: + if not self._is_catboost: if self._is_lightgbm: # https://github.com/scikit-learn-contrib/boruta_py/pull/78 self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) @@ -499,7 +499,7 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - if self.is_cat: + if self._is_catboost: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: @@ -546,7 +546,7 @@ def _get_shap_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -603,7 +603,7 @@ def _get_perm_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) From e278764fc4c4a86eaec1bbeee841e559c51cad23 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:52:16 +0200 Subject: [PATCH 7/9] splitting import into different lines --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 49b2ce7..ab773e5 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -17,7 +17,8 @@ import matplotlib import matplotlib.pyplot as plt from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.base import is_classifier, is_regressor from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D @@ -176,15 +177,14 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.importance = importance self.cat_name = None self.cat_idx = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - self.__version__ = '0.3' + # Catboost doesn't allow to change random seed after fitting + self._is_catboost = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) def fit(self, X, y): """ From 132e265677f456cb92956c800c9b637c27a8a052 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:54:32 +0200 Subject: [PATCH 8/9] import warnings --- boruta/boruta_py.py | 1 + 1 file changed, 1 insertion(+) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index ab773e5..6a1de8d 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -22,6 +22,7 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D +import warnings class BorutaPy(BaseEstimator, TransformerMixin): From 
504abf50b63ba7fb2e5429124043018b53cf09ef Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 28 Aug 2020 15:05:50 +0200 Subject: [PATCH 9/9] move weight in fit method --- boruta/boruta_py.py | 572 +++++++++++++++++++++++++------------ boruta/test/modif_tests.py | 37 +-- 2 files changed, 412 insertions(+), 197 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 6a1de8d..e1e14f7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,30 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - -from __future__ import print_function, division -import numpy as np -import scipy as sp -import shap -import pandas as pd -import time -import matplotlib -import matplotlib.pyplot as plt -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.base import is_classifier, is_regressor -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance -from matplotlib.lines import Line2D -import warnings - - class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. @@ -51,8 +24,8 @@ class BorutaPy(BaseEstimator, TransformerMixin): crucial parameter. For more info, please read about the perc parameter. - Automatic tree number: Setting the n_estimator to 'auto' will calculate the number of trees - in each itartion based on the number of features under investigation. - This way more trees are used when the training data has many feautres + in each iteration based on the number of features under investigation. + This way more trees are used when the training data has many features and less when most of the features have been rejected. - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. @@ -73,6 +46,32 @@ class BorutaPy(BaseEstimator, TransformerMixin): care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). + + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters ---------- estimator : object @@ -96,7 +95,9 @@ class BorutaPy(BaseEstimator, TransformerMixin): correction steps. importance : str, default = 'shap' The kind of variable importance used to compare and discriminate original - vs shadow predictors. + vs shadow predictors. Note that the builtin tree importance (gini/impurity based + importance) is biased towards numerical and large cardinality predictors, even + if they are random. Shapley values and permutation imp. 
are robust w.r.t those predictors. two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. @@ -107,12 +108,13 @@ class BorutaPy(BaseEstimator, TransformerMixin): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already + + Attributes ---------- n_features_ : int @@ -127,6 +129,20 @@ class BorutaPy(BaseEstimator, TransformerMixin): ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. + cat_name : list of str + the name of the categorical columns + cat_idx : list of int + the index of the categorical columns + imp_real_hist : array + array of the historical feature importance of the real predictors + sha_max : float + the maximum feature importance of the shadow predictors + col_names : list of str + the names of the real predictors + tag_df : dataframe + the df with the details (accepted or rejected) of the feature selection + + Examples -------- @@ -164,7 +180,7 @@ class BorutaPy(BaseEstimator, TransformerMixin): Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + def __init__(self, estimator, n_estimators=1000, perc=90, alpha=0.05, importance='shap', two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators @@ -173,21 +189,22 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = is_catboost(estimator) + self.is_lgb = is_lightgbm(estimator) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) + self.tag_df = None - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """ Fits the Boruta feature selection with the provided estimator. Parameters @@ -196,12 +213,15 @@ def fit(self, X, y): The training input samples. y : array-like, shape = [n_samples] The target values. 
+ sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample """ self.imp_real_hist = np.empty((0, X.shape[1]), float) if isinstance(X, pd.DataFrame) is not True: X = pd.DataFrame(X) self.col_names = X.columns.to_list() - return self._fit(X, y) + + return self._fit(X, y, sample_weight=sample_weight) def transform(self, X, weak=False, return_df=False): """ @@ -225,15 +245,18 @@ def transform(self, X, weak=False, return_df=False): return self._transform(X, weak, return_df) - def fit_transform(self, X, y, weak=False, return_df=False): + def fit_transform(self, X, y, sample_weight=None, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. + Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample weak: boolean, default = False If set to true, the tentative features are also used to reduce X. return_df : boolean, default = False @@ -244,18 +267,42 @@ def fit_transform(self, X, y, weak=False, return_df=False): X : array-like, shape = [n_samples, n_features_] The input matrix X's columns are reduced to the features which were selected by Boruta. + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val """ - self._fit(X, y) + self._fit(X, y, sample_weight=sample_weight) return self._transform(X, weak, return_df) - def plot_importance(self): + def plot_importance(self, n_feat_per_inch=5): """ Boxplot of the variable importance, ordered by magnitude The max shadow variable importance illustrated by the dashed line. Requires to apply the fit method first. 
:return: boxplot """ + set_my_plt_style() if self.imp_real_hist is None: raise ValueError("Use the fit method first to compute the var.imp") @@ -268,38 +315,33 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, vimp_df.shape[1] / n_feat_per_inch), fontsize=9 ) - - box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + blue_color = "#2590fa" + yellow_color = "#f0be00" + box_face_col = [blue_color] * sum(self.support_) + [yellow_color] * sum(self.support_weak_) for c in range(len(box_face_col)): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_color(box_face_col[c]) + + xrange = vimp_df.max(skipna=True).max(skipna=True) - vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True) - 0.10 * xrange) - xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) - custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), - Line2D([0], [0], color="#FDD023", lw=5), + custom_lines = [Line2D([0], [0], color=blue_color, lw=5), + Line2D([0], [0], color=yellow_color, lw=5), Line2D([0], [0], color="gray", lw=5), Line2D([0], [0], linestyle='--', color="gray", lw=2)] bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() plt.title('Boruta importance and selected predictors') - fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) - plt.tight_layout() + fig.set_size_inches((10, 1.5 * np.rint(max(vimp_df.shape) / 10))) + # plt.tight_layout() plt.show() - def _is_tree_based(self): - """ - checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) - :return: - """ - tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] - condition = any(i in str(type(self.estimator)) for i in tree_based_models) - return condition - - def _validate_pandas_input(self, arg): + @staticmethod + def _validate_pandas_input(arg): try: return arg.values except AttributeError: @@ -307,7 +349,45 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." ) - def _fit(self, X_raw, y): + def _fit(self, X_raw, y, sample_weight=None): + """ + Private method. 
+ Chaining: + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters + ---------- + X_raw : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + self : object + Nothing but attributes + """ + # self.is_cat = is_catboost(self.estimator) start_time = time.time() # basic cat features encoding @@ -323,7 +403,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self._is_catboost: + if self.is_cat is False: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -347,6 +427,10 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) + if sample_weight is not None: + if not isinstance(sample_weight, np.ndarray): + sample_weight = self._validate_pandas_input(sample_weight) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape @@ -378,16 +462,14 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self._is_catboost: - if self._is_lightgbm: - # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + if self.is_cat is False: + if self.is_lgb: self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) else: self.estimator.set_params(random_state=self.random_state) - # add shadow attributes, shuffle them and train estimator, get imps - cur_imp = self._add_shadows_get_imps(X, y, dec_reg) + cur_imp = self._add_shadows_get_imps(X, y, sample_weight, dec_reg) # get the threshold of shadow importances we will use for rejection imp_sha_max = np.percentile(cur_imp[1], self.perc) @@ -428,10 +510,23 @@ def _fit(self, X_raw, y): # for plotting self.imp_real_hist = imp_history self.sha_max = imp_sha_max + + if isinstance(X_raw, np.ndarray): + X_raw = pd.DataFrame(X_raw) + if isinstance(X_raw, pd.DataFrame): self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] + self.tag_df = pd.DataFrame({'predictor': list(X_raw.columns)}) + self.tag_df['Boruta'] = 1 + self.tag_df['Boruta'] = np.where(self.tag_df['predictor'].isin(list(self.support_names_)), 1, 0) - # ranking, confirmed variables are rank 1 + if isinstance(X_raw, pd.DataFrame): + self.support_weak_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_weak_) if x] + self.tag_df['Boruta_weak_incl'] = np.where(self.tag_df['predictor'].isin( + list(self.support_names_ + self.support_weak_names_) + ), 1, 0) + + # ranking, 
confirmed variables are rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) # tentative variables are rank 2 self.ranking_[tentative] = 2 @@ -470,6 +565,20 @@ def _fit(self, X_raw, y): return self def _transform(self, X, weak=False, return_df=False): + """ + Private method + + transform the predictor matrix by dropping the rejected and (optional) the undecided predictors + :param X: pd.DataFrame + predictor matrix + :param weak: bool + whether to drop or not the undecided predictors + :param return_df: bool + return a pandas dataframe or not + :return: + X: np.array or pd.DataFrame + the transformed predictors matrix + """ # sanity check try: self.ranking_ @@ -489,7 +598,7 @@ def _transform(self, X, weak=False, return_df=False): def _get_tree_num(self, n_feat): depth = self.estimator.get_params()['max_depth'] - if depth == None: + if depth is None: depth = 10 # how many times a feature should be considered on average f_repr = 100 @@ -498,17 +607,36 @@ def _get_tree_num(self, n_feat): n_estimators = int(multi * f_repr) return n_estimators - def _get_imp(self, X, y): + def _get_imp(self, X, y, sample_weight=None): + """ + Get the native feature importance (impurity based for instance) + This is know to return biased and uninformative results. + e.g. + https://scikit-learn.org/stable/auto_examples/inspection/ + plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + + or + https://explained.ai/rf-importance/ + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + """ try: - if self._is_catboost: + if self.is_cat: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: X[obj_feat] = X[obj_feat].astype('str').astype('category') cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() - self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + self.estimator.fit(X, y, sample_weight=sample_weight, cat_features=cat_feat) else: - self.estimator.fit(X, y, sample_weight=self.weight) + self.estimator.fit(X, y, sample_weight=sample_weight) except Exception as e: raise ValueError('Please check your X and y variable. 
The provided ' @@ -520,111 +648,28 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp - def _get_shap_imp(self, X, y): - # SHAP and permutation importances must be computed on unseen data - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. The provided ' - 'estimator cannot be fitted to your data.\n' + str(e)) - # build the explainer - explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") - shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification and lightgbm - if is_classifier(self.estimator): - if isinstance(shap_values, list): - # for lightgbm clf sklearn api, shap returns list of arrays - # https://github.com/slundberg/shap/issues/526 - class_inds = range(len(shap_values)) - shap_imp = np.zeros(shap_values[0].shape[1]) - for i, ind in enumerate(class_inds): - shap_imp += np.abs(shap_values[ind]).mean(0) - shap_imp /= len(shap_values) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - raise ValueError('Not a tree based model') - - return shap_imp - - def _get_perm_imp(self, X, y): - - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. 
The provided '
-                                 'estimator cannot be fitted to your data.\n' + str(e))
-
-            perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1)
-            imp = perm_imp.importances_mean.ravel()
-        else:
-            raise ValueError('Not a tree based model')
-
-        return imp
-
     def _get_shuffle(self, seq):
         self.random_state.shuffle(seq)
         return seq
 
-    def _add_shadows_get_imps(self, X, y, dec_reg):
+    def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
+        """
+        Add a shuffled copy of the columns (shadows) and get the feature importance of the augmented data set
+
+        :param X: pd.DataFrame of shape [n_samples, n_features]
+            predictor matrix
+        :param y: pd.series of shape [n_samples]
+            target
+        :param sample_weight: array-like, shape = [n_samples], default=None
+            Individual weights for each sample
+        :param dec_reg: array
+            holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)
+        :return:
+        imp_real: array
+            feature importance of the real predictors
+        imp_sha: array
+            feature importance of the shadow predictors
+        """
         # find features that are tentative still
         x_cur_ind = np.where(dec_reg >= 0)[0]
         x_cur = np.copy(X[:, x_cur_ind])
@@ -638,9 +683,9 @@ def _add_shadows_get_imps(self, X, y, dec_reg):
             x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
         # get importance of the merged matrix
         if self.importance == 'shap':
-            imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y)
+            imp = _get_shap_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
         elif self.importance == 'pimp':
-            imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y)
+            imp = _get_perm_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
         else:
             imp = self._get_imp(np.hstack((x_cur, x_sha)), y)
 
@@ -652,7 +697,18 @@ def _add_shadows_get_imps(self, X, y, dec_reg):
 
         return imp_real, imp_sha
 
-    def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
+    @staticmethod
+    def _assign_hits(hit_reg, cur_imp, imp_sha_max):
+        """
+        Count how many times a given feature was more important than the best of the shadow features
+        :param hit_reg: array
+            holds the number of hits, i.e. how many times a given feature was more important than the best of the shadow features
+        :param cur_imp: array
+            current importance
+        :param imp_sha_max: float
+            importance of the best shadow predictor
+        :return:
+        """
         # register hits for features that did better than the best of shadows
         cur_imp_no_nan = cur_imp[0]
         cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
@@ -661,6 +717,20 @@ def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
         return hit_reg
 
     def _do_tests(self, dec_reg, hit_reg, _iter):
+        """
+        Perform the test deciding whether the feature should be tagged as relevant (confirmed), not relevant
+        (rejected) or left undecided. The test is a binomial test on the hits accumulated over the attempts
+        performed so far, i.e. it counts how many times a given feature was more important than the best
+        of the shadow features and checks whether the associated probability is below, between or above
+        the rejection or acceptance threshold.
+ + :param dec_reg: array + holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected) + :param hit_reg: array + counts how many times a given feature was more important than the best of the shadow features + :param _iter: + :return: + """ active_features = np.where(dec_reg >= 0)[0] hits = hit_reg[active_features] # get uncorrected p values based on hit_reg @@ -696,7 +766,8 @@ def _do_tests(self, dec_reg, hit_reg, _iter): dec_reg[active_features[to_reject]] = -1 return dec_reg - def _fdrcorrection(self, pvals, alpha=0.05): + @staticmethod + def _fdrcorrection(pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. @@ -734,7 +805,8 @@ def _fdrcorrection(self, pvals, alpha=0.05): reject_[pvals_sortind] = reject return reject_, pvals_corrected_ - def _nanrankdata(self, X, axis=1): + @staticmethod + def _nanrankdata(X, axis=1): """ Replaces bottleneck's nanrankdata with scipy and numpy alternative. """ @@ -744,7 +816,12 @@ def _nanrankdata(self, X, axis=1): def _check_params(self, X, y): """ + Private method Check hyperparameters as well as X and y before proceeding with fit. + :param X: pd.DataFrame + predictor matrix + :param y: pd.series + target """ # check X and y are consistent len, X is Array and y is column X, y = check_X_y(X, y, dtype=None, force_all_finite=False) @@ -755,6 +832,19 @@ def _check_params(self, X, y): raise ValueError('Alpha should be between 0 and 1.') def _print_results(self, dec_reg, _iter, flag): + """ + Private method + printing the result + :param dec_reg: array + if the feature as been tagged as relevant (confirmed), not relevant (rejected) or undecided + :param _iter: int + the iteration number + :param flag: int + is still in the feature selection process or not + :return: + output: str + the output to be printed out + """ n_iter = str(_iter) + ' / ' + str(self.max_iter) n_confirmed = np.where(dec_reg == 1)[0].shape[0] n_rejected = np.where(dec_reg == -1)[0].shape[0] @@ -780,3 +870,125 @@ def _print_results(self, dec_reg, _iter, flag): vimp = 'native' output = "\n\nBorutaPy finished running using " + vimp + " var. 
imp.\n\n" + result print(output) + + +def _split_fit_estimator(estimator, X, y, sample_weight=None): + """ + Private function + split the train, test and fit the model + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + model + fitted model + X_tt: array [n_samples, n_features] + the test split, predictors + y_tt: array [n_samples] + the test split, target + """ + if sample_weight is not None: + w = sample_weight + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if check_if_tree_based(estimator): + try: + if is_catboost(estimator): + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + else: + raise ValueError('Not a tree based model') + + return model, X_tt, y_tt, w_tt + + +def _get_shap_imp(estimator, X, y, sample_weight=None): + """ + Private function + Get the SHAP feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + shap_imp, array + the SHAP importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification and lightgbm + if is_classifier(estimator): + if isinstance(shap_values, list): + # for lightgbm clf sklearn api, shap returns list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp += np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) + + return shap_imp + + +def _get_perm_imp(estimator, X, y, sample_weight): + """ + Private function + Get the permutation feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = 
[n_samples], default=None + Individual weights for each sample + + :return: + imp, array + the permutation importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + return imp diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py index 507db6f..3bc4a5d 100644 --- a/boruta/test/modif_tests.py +++ b/boruta/test/modif_tests.py @@ -92,7 +92,9 @@ def get_titanic_data(): X = pd.DataFrame(X_trans, columns = X.columns) # encode X, cat_var_df, inv_mapper = cat_var(X) - return X, y + # sample weight is just a dummy random vector for testing purpose + sample_weight = np.random.uniform(0,1, len(y)) + return X, y, sample_weight def get_boston_data(): boston = load_boston() @@ -103,23 +105,23 @@ def get_boston_data(): return X, y # Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors -def testing_estimators(varimp, models, X, y): +def testing_estimators(varimp, models, X, y, sample_weight=None): for model in models: - print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) - feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) - feat_selector.fit(X, y) + print('='*20 +' testing: {mod:>55} for var.imp: {vimp:<15} '.format(mod=str(model), vimp=varimp)+'='*20 ) + feat_selector = noglmgroot.BorutaPy(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y, sample_weight) print(feat_selector.support_names_) - feat_selector.plot_importance() + feat_selector.plot_importance(n_feat_per_inch=3) gc.enable() del(feat_selector, model) gc.collect() -def testing_clf_all_varimp(X, y): +def testing_clf_all_varimp(X, y, sample_weight=None): for varimp in ['shap', 'pimp', 'native']: - models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] - testing_estimators(varimp=varimp, models=models, X=X, y=y) + models = [RandomForestClassifier(n_jobs= 4, oob_score= True), catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y, sample_weight=sample_weight) gc.enable() - del(models) + del models gc.collect() def testing_regr_all_varimp(X, y): @@ -129,13 +131,13 @@ def testing_regr_all_varimp(X, y): gc.enable() del(models) gc.collect() - -X, y = get_titanic_data() -X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +print('='*20 + ' Benchmarking using sklearn permutation importance ' + '='*20 ) +X, y, sample_weight = get_titanic_data() +X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, sample_weight, stratify=y, random_state=42) # lightgbm faster and better than RF lgb_model = LGBMClassifier(n_jobs= 4) -lgb_model.fit(X_train, y_train) +lgb_model.fit(X_train, y_train, sample_weight=w_train) result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() # Plot @@ -146,9 +148,10 @@ def testing_regr_all_varimp(X, y): plt.show() if __name__ == '__main__': - # classification and cat. 
pred
-    X, y = get_titanic_data()
-    testing_clf_all_varimp(X=X, y=y)
+    # classification and cat. pred, sample weight is just a dummy random vector for testing purposes
+    X, y, sample_weight = get_titanic_data()
+    testing_clf_all_varimp(X=X, y=y, sample_weight=sample_weight)
+
     # regression
     X, y = get_boston_data()
    testing_regr_all_varimp(X=X, y=y)
\ No newline at end of file
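
As a quick reference for reviewers, the snippet below sketches how the pieces introduced by this patch series fit together: the importance argument selects 'native', 'shap' or 'pimp' importance, sample weights are now passed to fit rather than to the constructor, and the outcome can be inspected through support_names_, tag_df and plot_importance. The dataset, estimator settings and variable names below are illustrative assumptions for the sketch, not part of the patch.

    import numpy as np
    import pandas as pd
    from lightgbm import LGBMClassifier
    from boruta import BorutaPy

    # synthetic data: 5 informative columns and 5 pure-noise columns (illustrative only)
    rng = np.random.RandomState(42)
    X = pd.DataFrame(rng.normal(size=(500, 10)), columns=["x%d" % i for i in range(10)])
    y = (X[["x0", "x1", "x2", "x3", "x4"]].sum(axis=1) > 0).astype(int)
    sw = rng.uniform(0.5, 1.5, size=len(y))          # dummy per-sample weights

    model = LGBMClassifier(max_depth=5, n_jobs=-1)
    feat_selector = BorutaPy(model, n_estimators=100, importance='shap',  # or 'pimp' / 'native'
                             max_iter=50, random_state=42, verbose=1)
    feat_selector.fit(X, y, sample_weight=sw)        # sample_weight handled in fit
    print(feat_selector.support_names_)              # names of the confirmed predictors
    print(feat_selector.tag_df.head())               # per-predictor accepted/rejected summary
    feat_selector.plot_importance(n_feat_per_inch=5) # boxplot with the shadow-max threshold
    X_sel = feat_selector.transform(X, return_df=True)

With a tree-based estimator such as the LightGBM classifier above, the SHAP and permutation importance paths fit on a train split and score on a held-out split, which is what makes them less biased than the builtin impurity importance for random numerical or high-cardinality predictors.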