From 1bb3c53e71ef811048e1e03ce3311803f3b4baa0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 14 May 2020 18:56:38 +0200 Subject: [PATCH 1/9] catboost-plot-pimp-shapimp --- boruta/boruta_py.py | 358 +++++++++++++++++++++++++++++-------- boruta/test/modif_tests.py | 154 ++++++++++++++++ 2 files changed, 435 insertions(+), 77 deletions(-) create mode 100644 boruta/test/modif_tests.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index fd3f20b..c346708 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,18 @@ +from __future__ import print_function, division +import numpy as np +import scipy as sp +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance +import shap +import pandas as pd +import time +import matplotlib +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- """ @@ -8,17 +23,11 @@ License: BSD 3 clause """ -from __future__ import print_function, division -import numpy as np -import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. - The improvements of this implementation include: - Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods. @@ -48,40 +57,33 @@ class BorutaPy(BaseEstimator, TransformerMixin): - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. Confirmed ones are 1, Tentatives are 2, and the rejected are ranked - starting from 3, based on their feautre importance history through + starting from 3, based on their feature importance history through the iterations. - + - Using either the native variable importance, scikit permutation importance, + SHAP importance. We highly recommend using pruned trees with a depth between 3-7. - For more, see the docs of these functions, and the examples below. - Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - Boruta is an all relevant feature selection method, while most other are minimal optimal; this means it tries to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some classifier has a minimal error. - Why bother with all relevant feature selection? When you try to understand the phenomenon that made your data, you should care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). - Parameters ---------- - estimator : object A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute. Important features must correspond to high absolute values in the feature_importances_. - n_estimators : int or string, default = 1000 If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined automatically based on the size of the dataset. The other parameters of the used estimators need to be set with initialisation. 
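As an aside, the shadow-feature comparison described above can be sketched in a few lines. This is only an illustration of the idea, not the BorutaPy internals; the estimator is assumed to expose feature_importances_ and X is assumed to be a NumPy array:

    import numpy as np

    def one_boruta_round(estimator, X, y, perc=100, seed=0):
        """One illustrative Boruta iteration: real features vs. shuffled 'shadow' copies."""
        rng = np.random.default_rng(seed)
        X_shadow = X.copy()
        for j in range(X_shadow.shape[1]):
            rng.shuffle(X_shadow[:, j])              # destroy any real signal, keep the marginal distribution
        estimator.fit(np.hstack([X, X_shadow]), y)
        imp = estimator.feature_importances_
        imp_real, imp_shadow = imp[:X.shape[1]], imp[X.shape[1]:]
        threshold = np.percentile(imp_shadow, perc)  # perc=100 reproduces the original "max shadow" rule
        return imp_real > threshold                  # per-feature "hit" for this iteration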
- perc : int, default = 100 Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between shadow and real features. The max @@ -89,93 +91,81 @@ class BorutaPy(BaseEstimator, TransformerMixin): lower perc is the more false positives will be picked as relevant but also the less relevant features will be left out. The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max. - alpha : float, default = 0.05 Level at which the corrected p-values will get rejected in both correction steps. - + importance : str, default = 'shap' + The kind of variable importance used to compare and discriminate original + vs shadow predictors. two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. - max_iter : int, default = 100 The number of maximum iterations to perform. - random_state : int, RandomState instance or None; default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - + weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already - Attributes ---------- - n_features_ : int The number of selected features. - support_ : array of shape [n_features] - The mask of selected features - only confirmed ones are True. - support_weak_ : array of shape [n_features] - The mask of selected tentative features, which haven't gained enough support during the max_iter number of iterations.. - ranking_ : array of shape [n_features] - The feature ranking, such that ``ranking_[i]`` corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. - Examples -------- - + import pandas as pd from sklearn.ensemble import RandomForestClassifier from boruta import BorutaPy - + # load X and y # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute X = pd.read_csv('examples/test_X.csv', index_col=0).values y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values y = y.ravel() - + # define random forest classifier, with utilising all cores and # sampling in proportion to y labels rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5) - + # define Boruta feature selection method feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1) - + # find all relevant features - 5 features should be selected feat_selector.fit(X, y) - + # check selected features - first 5 features are selected feat_selector.support_ - + # check ranking of features feat_selector.ranking_ - + # call transform() on X to filter it down to selected features X_filtered = feat_selector.transform(X) - References ---------- - [1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package" Journal of Statistical Software, Vol. 
36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, - two_step=True, max_iter=100, random_state=None, verbose=0): + def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators self.perc = perc @@ -183,39 +173,48 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose + self.weight = weight + self.importance = importance + self.cat_name = None + self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = False + # plotting + self.imp_real_hist = None + self.sha_max = None + self.col_names = None def fit(self, X, y): """ Fits the Boruta feature selection with the provided estimator. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. """ - + self.imp_real_hist = np.empty((0, X.shape[1]), float) + if isinstance(X, pd.DataFrame) is not True: + X = pd.DataFrame(X) + self.col_names = X.columns.to_list() return self._fit(X, y) def transform(self, X, weak=False, return_df=False): """ Reduces the input X to the features selected by Boruta. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - + return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -228,22 +227,17 @@ def transform(self, X, weak=False, return_df=False): def fit_transform(self, X, y, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -254,6 +248,63 @@ def fit_transform(self, X, y, weak=False, return_df=False): self._fit(X, y) return self._transform(X, weak, return_df) + def plot_importance(self): + """ + Boxplot of the variable importance, ordered by magnitude + The max shadow variable importance illustrated by the dashed line. + Requires to apply the fit method first. 
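A usage sketch of the options added in this patch (illustrative only; it assumes a LightGBM classifier and a feature DataFrame X with target y, as in the example above):

    from lightgbm import LGBMClassifier
    from boruta import BorutaPy

    model = LGBMClassifier(max_depth=5, n_jobs=-1)
    feat_selector = BorutaPy(model, n_estimators=100, importance='shap',  # 'shap', 'pimp' or the native importance
                             max_iter=50, random_state=42, verbose=1)
    feat_selector.fit(X, y)           # X may be a pandas DataFrame; object columns are encoded internally
    feat_selector.plot_importance()   # boxplots of per-iteration importances, dashed line marks the max shadow importance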
+ :return: boxplot + """ + if self.imp_real_hist is None: + raise ValueError("Use the fit method first to compute the var.imp") + + color = {'boxes': 'gray', 'whiskers': 'gray', 'medians': '#404040', 'caps': 'gray'} + vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.col_names) + vimp_df = vimp_df.reindex(vimp_df.mean().sort_values(ascending=True).index, axis=1) + bp = vimp_df.boxplot(color=color, + boxprops=dict(linestyle='-', linewidth=1.5), + flierprops=dict(linestyle='-', linewidth=1.5), + medianprops=dict(linestyle='-', linewidth=1.5), + whiskerprops=dict(linestyle='-', linewidth=1.5), + capprops=dict(linestyle='-', linewidth=1.5), + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, 3 * len(self.support_) / 10) + ) + + box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + for c in range(len(box_face_col)): + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + + # xt = bp.get_xticks() + # xt = np.round(np.append(xt, self.sha_max), 2) + # xtl = xt.tolist() + # xtl[-1] = "sha. max" + # bp.set_xticks(xt) + # bp.set_xticklabels(xtl) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), + Line2D([0], [0], color="#FDD023", lw=5), + Line2D([0], [0], color="gray", lw=5), + Line2D([0], [0], linestyle='--', color="gray", lw=2)] + bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") + plt.axvline(x=self.sha_max, linestyle='--', color='gray') + fig = bp.get_figure() + fig.suptitle('Boruta importance and selected predictors') + plt.show() + + def _is_catboost(self): + self.is_cat = 'catboost' in str(type(self.estimator)) + + def _is_tree_based(self): + """ + checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) + :return: + """ + tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] + condition = any(i in str(type(self.estimator)) for i in tree_based_models) + return condition + def _validate_pandas_input(self, arg): try: return arg.values @@ -262,16 +313,48 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." 
) - def _fit(self, X, y): + def _fit(self, X_raw, y): + self._is_catboost() + + start_time = time.time() + # basic cat features encoding + # First, let's store "object" columns as categorical columns + # obj_feat = X_raw.dtypes.loc[X_raw.dtypes == 'object'].index.tolist() + obj_feat = list(set(list(X_raw.columns)) - set(list(X_raw.select_dtypes(include=[np.number])))) + + if obj_feat: + X_raw.loc[:, obj_feat] = X_raw[obj_feat].astype('str').astype('category') + cat_feat = X_raw.dtypes.loc[X_raw.dtypes == 'category'].index.tolist() + cat_idx = [X_raw.columns.get_loc(c) for c in cat_feat if c in cat_feat] + if cat_feat: + # a way without loop but need to re-do astype + Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() + + if self.is_cat is False: + if cat_feat: + X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) + else: + X = X_raw + else: + X = X_raw + + X = np.nan_to_num(X) + y = np.nan_to_num(y) + # w = w.fillna(0) + + self.cat_name = cat_feat + self.cat_idx = cat_idx + # check input params self._check_params(X, y) if not isinstance(X, np.ndarray): - X = self._validate_pandas_input(X) + X = self._validate_pandas_input(X) + if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state = check_random_state(self.random_state) + self.random_state_instance = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -301,7 +384,9 @@ def _fit(self, X, y): self.estimator.set_params(n_estimators=n_tree) # make sure we start with a new tree in each iteration - self.estimator.set_params(random_state=self.random_state) + # Catboost doesn't allow to change random seed after fitting + if self.is_cat is False: + self.estimator.set_params(random_state=self.random_state) # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -342,6 +427,11 @@ def _fit(self, X, y): self.support_[confirmed] = 1 self.support_weak_ = np.zeros(n_feat, dtype=np.bool) self.support_weak_[tentative] = 1 + # for plotting + self.imp_real_hist = imp_history + self.sha_max = imp_sha_max + if isinstance(X_raw, pd.DataFrame): + self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] # ranking, confirmed variables are rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) @@ -356,18 +446,18 @@ def _fit(self, X, y): # update rank for not_selected features if not_selected.shape[0] > 0: - # calculate ranks in each iteration, then median of ranks across feats - iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) - rank_medians = np.nanmedian(iter_ranks, axis=0) - ranks = self._nanrankdata(rank_medians, axis=0) - - # set smallest rank to 3 if there are tentative feats - if tentative.shape[0] > 0: - ranks = ranks - np.min(ranks) + 3 - else: - # and 2 otherwise - ranks = ranks - np.min(ranks) + 2 - self.ranking_[not_selected] = ranks + # calculate ranks in each iteration, then median of ranks across feats + iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) + rank_medians = np.nanmedian(iter_ranks, axis=0) + ranks = self._nanrankdata(rank_medians, axis=0) + + # set smallest rank to 3 if there are tentative feats + if tentative.shape[0] > 0: + ranks = ranks - np.min(ranks) + 3 + else: + # and 2 otherwise + ranks = ranks - np.min(ranks) + 2 + self.ranking_[not_selected] = ranks else: # all are selected, thus we set feature supports to True self.support_ = np.ones(n_feat, dtype=np.bool) @@ -375,6 +465,10 @@ def _fit(self, 
X, y): # notify user if self.verbose > 0: self._print_results(dec_reg, _iter, 1) + self.running_time = time.time() - start_time + hours, rem = divmod(self.running_time, 3600) + minutes, seconds = divmod(rem, 60) + print("All relevant predictors selected in {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) return self def _transform(self, X, weak=False, return_df=False): @@ -408,7 +502,16 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - self.estimator.fit(X, y) + if self.is_cat: + X = pd.DataFrame(X) + obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() + if obj_feat: + X[obj_feat] = X[obj_feat].astype('str').astype('category') + cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() + self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + else: + self.estimator.fit(X, y, sample_weight=self.weight) + except Exception as e: raise ValueError('Please check your X and y variable. The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) @@ -419,8 +522,100 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp + def _get_shap_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + # if obj_feat: + # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. 
The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification + if is_classifier(self.estimator): + shap_values = np.array(shap_values) + shap_values = shap_values.reshape(-1, shap_values.shape[-1]) + + shap_imp = np.mean(np.abs(shap_values), axis=0) + else: + raise ValueError('Not a tree based model') + + return shap_imp + + def _get_perm_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + + perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + else: + raise ValueError('Not a tree based model') + + return imp + def _get_shuffle(self, seq): - self.random_state.shuffle(seq) + self.random_state_instance.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): @@ -436,12 +631,19 @@ def _add_shadows_get_imps(self, X, y, dec_reg): # shuffle xSha x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha) # get importance of the merged matrix - imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + if self.importance == 'shap': + imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y) + elif self.importance == 'pimp': + imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y) + else: + imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + # separate importances of real and shadow features imp_sha = imp[x_cur_w:] imp_real = np.zeros(X.shape[1]) imp_real[:] = np.nan imp_real[x_cur_ind] = imp[:x_cur_w] + return imp_real, imp_sha def _assign_hits(self, hit_reg, cur_imp, imp_sha_max): @@ -492,14 +694,12 @@ def _fdrcorrection(self, pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. - Parameters ---------- pvals : array_like set of p-values of the individual tests. alpha : float error rate - Returns ------- rejected : array, bool @@ -541,7 +741,7 @@ def _check_params(self, X, y): Check hyperparameters as well as X and y before proceeding with fit. 
""" # check X and y are consistent len, X is Array and y is column - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, dtype=None, force_all_finite=False) if self.perc <= 0 or self.perc > 100: raise ValueError('The percentile should be between 0 and 100.') @@ -568,5 +768,9 @@ def _print_results(self, dec_reg, _iter, flag): n_tentative = np.sum(self.support_weak_) content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected]) result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)]) - output = "\n\nBorutaPy finished running.\n\n" + result + if self.importance in ['shap', 'pimp']: + vimp = str(self.importance) + else: + vimp = 'native' + output = "\n\nBorutaPy finished running using " + vimp + " var. imp.\n\n" + result print(output) diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/modif_tests.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. 
+ + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. 
predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file From bbea1b745c51eac0031e2b7b854c406480df52d0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Sat, 16 May 2020 12:54:56 +0200 Subject: [PATCH 2/9] correcting typos --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index c346708..1cb1416 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -542,15 +542,15 @@ def _get_shap_imp(self, X, y): obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) obj_idx = None - # if obj_feat: - # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() if self._is_tree_based(): try: if self.is_cat: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -607,7 +607,7 @@ def _get_perm_imp(self, X, y): raise ValueError('Please check your X and y variable. 
The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) - perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) imp = perm_imp.importances_mean.ravel() else: raise ValueError('Not a tree based model') From e8062d7cd58c1f2e13fa1c989a1afe7770782f06 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Mon, 18 May 2020 17:54:23 +0200 Subject: [PATCH 3/9] fix the fig size + tight layout --- boruta/boruta_py.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 1cb1416..7d943b7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -267,8 +267,7 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, - figsize=(10, 3 * len(self.support_) / 10) + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True ) box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) @@ -276,13 +275,8 @@ def plot_importance(self): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - # xt = bp.get_xticks() - # xt = np.round(np.append(xt, self.sha_max), 2) - # xtl = xt.tolist() - # xtl[-1] = "sha. max" - # bp.set_xticks(xt) - # bp.set_xticklabels(xtl) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), Line2D([0], [0], color="#FDD023", lw=5), Line2D([0], [0], color="gray", lw=5), @@ -290,7 +284,9 @@ def plot_importance(self): bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. 
max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() - fig.suptitle('Boruta importance and selected predictors') + plt.title('Boruta importance and selected predictors') + fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) + plt.tight_layout() plt.show() def _is_catboost(self): From 6a05d378c6c2a91eb0ea643c255ad685afe4fd87 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 28 May 2020 16:16:55 +0200 Subject: [PATCH 4/9] fixing lgb clf shap imp --- boruta/boruta_py.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 7d943b7..3136d47 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -292,6 +292,9 @@ def plot_importance(self): def _is_catboost(self): self.is_cat = 'catboost' in str(type(self.estimator)) + def _is_lightgbm(self): + self.is_lgb = 'lightgbm' in str(type(self.estimator)) + def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -556,12 +559,20 @@ def _get_shap_imp(self, X, y): # build the explainer explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification + # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): - shap_values = np.array(shap_values) - shap_values = shap_values.reshape(-1, shap_values.shape[-1]) - - shap_imp = np.mean(np.abs(shap_values), axis=0) + if isinstance(shap_values, list): + # for lightgbm clf sklearn, shap return list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) else: raise ValueError('Not a tree based model') From 2081d391cdf883742d3ddf439dccd2ec3cc6a63f Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:35:28 +0200 Subject: [PATCH 5/9] changes for lightgbm seed --- boruta/boruta_py.py | 42 +- .../Madalon_Data_Set-checkpoint.ipynb | 405 ++++++++++++++++++ boruta/examples/Madalon_Data_Set.ipynb | 21 +- .../.ipynb_checkpoints/__init__-checkpoint.py | 0 .../modif_tests-checkpoint.py | 154 +++++++ .../unit_tests-checkpoint.py | 57 +++ 6 files changed, 645 insertions(+), 34 deletions(-) create mode 100644 boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb create mode 100644 boruta/test/.ipynb_checkpoints/__init__-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 3136d47..eaa79ba 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,17 +1,15 @@ from __future__ import print_function, division import numpy as np import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance import shap 
import pandas as pd import time import matplotlib import matplotlib.pyplot as plt +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -180,7 +178,9 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = False + self.is_cat = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self.is_lgb = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None @@ -289,12 +289,6 @@ def plot_importance(self): plt.tight_layout() plt.show() - def _is_catboost(self): - self.is_cat = 'catboost' in str(type(self.estimator)) - - def _is_lightgbm(self): - self.is_lgb = 'lightgbm' in str(type(self.estimator)) - def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -313,7 +307,6 @@ def _validate_pandas_input(self, arg): ) def _fit(self, X_raw, y): - self._is_catboost() start_time = time.time() # basic cat features encoding @@ -329,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if self.is_cat is False: + if not self.is_cat: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -353,7 +346,7 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state_instance = check_random_state(self.random_state) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -384,8 +377,13 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if self.is_cat is False: - self.estimator.set_params(random_state=self.random_state) + if not self.is_cat: + if self._is_lightgbm: + # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) + else: + self.estimator.set_params(random_state=self.random_state) + # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -522,7 +520,7 @@ def _get_imp(self, X, y): return imp def _get_shap_imp(self, X, y): - + # SHAP and permutation importances must be computed on unseen data if self.weight is not None: w = self.weight if is_regressor(self.estimator): @@ -562,12 +560,12 @@ def _get_shap_imp(self, X, y): # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): if isinstance(shap_values, list): - # for lightgbm clf sklearn, shap return list of arrays + # for lightgbm clf sklearn api, shap returns list of arrays # https://github.com/slundberg/shap/issues/526 class_inds = range(len(shap_values)) shap_imp = np.zeros(shap_values[0].shape[1]) for i, ind in enumerate(class_inds): - shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp += np.abs(shap_values[ind]).mean(0) shap_imp /= len(shap_values) else: shap_imp = np.abs(shap_values).mean(0) @@ 
-622,7 +620,7 @@ def _get_perm_imp(self, X, y): return imp def _get_shuffle(self, seq): - self.random_state_instance.shuffle(seq) + self.random_state.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): diff --git a/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb new file mode 100644 index 0000000..45fb14a --- /dev/null +++ b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Boruta on the Madalon Data Set\n", + "Author: [Mike Bernico](mike.bernico@gmail.com)\n", + "\n", + "This example demonstrates using Boruta to find all relevant features in the Madalon dataset, which is an artificial dataset used in NIPS2003 and cited in the [Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)\n", + "\n", + "This dataset has 2000 observations and 500 features. We will use Boruta to identify the features that are relevant to the classification task.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Installation\n", + "#!pip install boruta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from boruta import BorutaPy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "def load_data():\n", + " # URLS for dataset via UCI\n", + " train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'\n", + " train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'\n", + "\n", + " X_data = pd.read_csv(train_data_url, sep=\" \", header=None)\n", + " y_data = pd.read_csv(train_label_url, sep=\" \", header=None)\n", + " data = X_data.loc[:, :499]\n", + " data['target'] = y_data[0]\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data = load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...491492493494495496497498499target
0485477537479452471491476475473...481477485511485481479475496-1
1483458460487587475526479485469...478487338513486483492510517-1
2487542499468448471442478480477...481492650506501480489499498-1
3480491510485495472417474502476...4804745724544694754824944611
4484502528489466481402478487468...4794524354865084815044955111
\n", + "

5 rows × 501 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 491 492 493 \\\n", + "0 485 477 537 479 452 471 491 476 475 473 ... 481 477 485 \n", + "1 483 458 460 487 587 475 526 479 485 469 ... 478 487 338 \n", + "2 487 542 499 468 448 471 442 478 480 477 ... 481 492 650 \n", + "3 480 491 510 485 495 472 417 474 502 476 ... 480 474 572 \n", + "4 484 502 528 489 466 481 402 478 487 468 ... 479 452 435 \n", + "\n", + " 494 495 496 497 498 499 target \n", + "0 511 485 481 479 475 496 -1 \n", + "1 513 486 483 492 510 517 -1 \n", + "2 506 501 480 489 499 498 -1 \n", + "3 454 469 475 482 494 461 1 \n", + "4 486 508 481 504 495 511 1 \n", + "\n", + "[5 rows x 501 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y = data.pop('target')\n", + "X = data.copy().values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta conforms to the sklearn api and can be used in a Pipeline as well as on it's own. Here we will demonstrate stand alone operation.\n", + "\n", + "First we will instantiate an estimator that Boruta will use. Then we will instantiate a Boruta Object." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", + "# Define Boruta feature selection method\n", + "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once built, we can use this object to identify the relevant features in our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta has confirmed only a few features as useful. When our run ended, Boruta was undecided on 2 features. '\n", + "\n", + "We can interrogate .support_ to understand which features were selected. .support_ returns an array of booleans that we can use to slice our feature matrix to include only relevant columns. Of course, .transform can also be used, as expected in the scikit API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check selected features\n", + "print(feat_selector.support_)\n", + "# Select the chosen features from our dataframe.\n", + "selected = X[:, feat_selector.support_]\n", + "print (\"\")\n", + "print (\"Selected Feature Matrix Shape\")\n", + "print (selected.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also interrogate the ranking of the unselected features with .ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/boruta/examples/Madalon_Data_Set.ipynb b/boruta/examples/Madalon_Data_Set.ipynb index 435eba0..45fb14a 100644 --- a/boruta/examples/Madalon_Data_Set.ipynb +++ b/boruta/examples/Madalon_Data_Set.ipynb @@ -18,9 +18,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Installation\n", @@ -43,7 +41,10 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -279,9 +280,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", @@ -348,9 +347,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -371,7 +368,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" }, "varInspector": { "cols": { @@ -404,5 +401,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py b/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from 
sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. + + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = 
preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file diff --git a/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py new file mode 100644 index 0000000..5d5ce9f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py @@ -0,0 +1,57 @@ +import unittest +from boruta import BorutaPy +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +import numpy as np + + +class BorutaTestCases(unittest.TestCase): + + def test_get_tree_num(self): + rfc = RandomForestClassifier(max_depth=10) + bt = BorutaPy(rfc) + self.assertEqual(bt._get_tree_num(10), 44, "Tree Est. Math Fail") + self.assertEqual(bt._get_tree_num(100), 141, "Tree Est. 
Math Fail") + + def test_if_boruta_extracts_relevant_features(self): + np.random.seed(42) + y = np.random.binomial(1, 0.5, 1000) + X = np.zeros((1000, 10)) + + z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000) + z[z == -1] = 0 + z[z == 2] = 1 + + # 5 relevant features + X[:, 0] = z + X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000) + X[:, 2] = y + np.random.normal(0, 1, 1000) + X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000) + X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000) + + # 5 irrelevant features + X[:, 5] = np.random.normal(0, 1, 1000) + X[:, 6] = np.random.poisson(1, 1000) + X[:, 7] = np.random.binomial(1, 0.3, 1000) + X[:, 8] = np.random.normal(0, 1, 1000) + X[:, 9] = np.random.poisson(1, 1000) + + rfc = RandomForestClassifier() + bt = BorutaPy(rfc) + bt.fit(X, y) + + # make sure that only all the relevant features are returned + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # test if this works as expected for dataframe input + X_df, y_df = pd.DataFrame(X), pd.Series(y) + bt.fit(X_df, y_df) + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # check it dataframe is returned when return_df=True + self.assertIsInstance(bt.transform(X_df, return_df=True), pd.DataFrame) + +if __name__ == '__main__': + unittest.main() + + From a6c4b8ad85166aee10f07e04b6facdac2277257c Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:43:51 +0200 Subject: [PATCH 6/9] naming of var consistent with master --- boruta/boruta_py.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index eaa79ba..49b2ce7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Author: Daniel Homola + +Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ + +License: BSD 3 clause +""" + from __future__ import print_function, division import numpy as np import scipy as sp @@ -11,16 +21,6 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - class BorutaPy(BaseEstimator, TransformerMixin): @@ -171,20 +171,20 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state - self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = 'catboost' in str(type(self.estimator)) + self._is_catboost = 'catboost' in str(type(self.estimator)) # Random state throws an error with lightgbm - self.is_lgb = 'lightgbm' in str(type(self.estimator)) + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None + self.__version__ = '0.3' def fit(self, X, y): """ @@ -322,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self.is_cat: + if not self._is_catboost: if cat_feat: X = 
pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -377,7 +377,7 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self.is_cat: + if not self._is_catboost: if self._is_lightgbm: # https://github.com/scikit-learn-contrib/boruta_py/pull/78 self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) @@ -499,7 +499,7 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - if self.is_cat: + if self._is_catboost: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: @@ -546,7 +546,7 @@ def _get_shap_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -603,7 +603,7 @@ def _get_perm_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) From e278764fc4c4a86eaec1bbeee841e559c51cad23 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:52:16 +0200 Subject: [PATCH 7/9] splitting import into different lines --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 49b2ce7..ab773e5 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -17,7 +17,8 @@ import matplotlib import matplotlib.pyplot as plt from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.base import is_classifier, is_regressor from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D @@ -176,15 +177,14 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.importance = importance self.cat_name = None self.cat_idx = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - self.__version__ = '0.3' + # Catboost doesn't allow to change random seed after fitting + self._is_catboost = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) def fit(self, X, y): """ From 132e265677f456cb92956c800c9b637c27a8a052 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:54:32 +0200 Subject: [PATCH 8/9] import warnings --- boruta/boruta_py.py | 1 + 1 file changed, 1 insertion(+) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index ab773e5..6a1de8d 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -22,6 +22,7 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D +import warnings class BorutaPy(BaseEstimator, TransformerMixin): From 
504abf50b63ba7fb2e5429124043018b53cf09ef Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 28 Aug 2020 15:05:50 +0200 Subject: [PATCH 9/9] move weight in fit method --- boruta/boruta_py.py | 572 +++++++++++++++++++++++++------------ boruta/test/modif_tests.py | 37 +-- 2 files changed, 412 insertions(+), 197 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 6a1de8d..e1e14f7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,30 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - -from __future__ import print_function, division -import numpy as np -import scipy as sp -import shap -import pandas as pd -import time -import matplotlib -import matplotlib.pyplot as plt -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.base import is_classifier, is_regressor -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance -from matplotlib.lines import Line2D -import warnings - - class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. @@ -51,8 +24,8 @@ class BorutaPy(BaseEstimator, TransformerMixin): crucial parameter. For more info, please read about the perc parameter. - Automatic tree number: Setting the n_estimator to 'auto' will calculate the number of trees - in each itartion based on the number of features under investigation. - This way more trees are used when the training data has many feautres + in each iteration based on the number of features under investigation. + This way more trees are used when the training data has many features and less when most of the features have been rejected. - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. @@ -73,6 +46,32 @@ class BorutaPy(BaseEstimator, TransformerMixin): care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). + + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters ---------- estimator : object @@ -96,7 +95,9 @@ class BorutaPy(BaseEstimator, TransformerMixin): correction steps. importance : str, default = 'shap' The kind of variable importance used to compare and discriminate original - vs shadow predictors. + vs shadow predictors. Note that the builtin tree importance (gini/impurity based + importance) is biased towards numerical and large cardinality predictors, even + if they are random. Shapley values and permutation imp. 
are robust w.r.t those predictors. two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. @@ -107,12 +108,13 @@ class BorutaPy(BaseEstimator, TransformerMixin): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already + + Attributes ---------- n_features_ : int @@ -127,6 +129,20 @@ class BorutaPy(BaseEstimator, TransformerMixin): ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. + cat_name : list of str + the name of the categorical columns + cat_idx : list of int + the index of the categorical columns + imp_real_hist : array + array of the historical feature importance of the real predictors + sha_max : float + the maximum feature importance of the shadow predictors + col_names : list of str + the names of the real predictors + tag_df : dataframe + the df with the details (accepted or rejected) of the feature selection + + Examples -------- @@ -164,7 +180,7 @@ class BorutaPy(BaseEstimator, TransformerMixin): Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + def __init__(self, estimator, n_estimators=1000, perc=90, alpha=0.05, importance='shap', two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators @@ -173,21 +189,22 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = is_catboost(estimator) + self.is_lgb = is_lightgbm(estimator) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) + self.tag_df = None - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """ Fits the Boruta feature selection with the provided estimator. Parameters @@ -196,12 +213,15 @@ def fit(self, X, y): The training input samples. y : array-like, shape = [n_samples] The target values. 
+ sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample """ self.imp_real_hist = np.empty((0, X.shape[1]), float) if isinstance(X, pd.DataFrame) is not True: X = pd.DataFrame(X) self.col_names = X.columns.to_list() - return self._fit(X, y) + + return self._fit(X, y, sample_weight=sample_weight) def transform(self, X, weak=False, return_df=False): """ @@ -225,15 +245,18 @@ def transform(self, X, weak=False, return_df=False): return self._transform(X, weak, return_df) - def fit_transform(self, X, y, weak=False, return_df=False): + def fit_transform(self, X, y, sample_weight=None, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. + Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample weak: boolean, default = False If set to true, the tentative features are also used to reduce X. return_df : boolean, default = False @@ -244,18 +267,42 @@ def fit_transform(self, X, y, weak=False, return_df=False): X : array-like, shape = [n_samples, n_features_] The input matrix X's columns are reduced to the features which were selected by Boruta. + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val """ - self._fit(X, y) + self._fit(X, y, sample_weight=sample_weight) return self._transform(X, weak, return_df) - def plot_importance(self): + def plot_importance(self, n_feat_per_inch=5): """ Boxplot of the variable importance, ordered by magnitude The max shadow variable importance illustrated by the dashed line. Requires to apply the fit method first. 
:return: boxplot """ + set_my_plt_style() if self.imp_real_hist is None: raise ValueError("Use the fit method first to compute the var.imp") @@ -268,38 +315,33 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, vimp_df.shape[1] / n_feat_per_inch), fontsize=9 ) - - box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + blue_color = "#2590fa" + yellow_color = "#f0be00" + box_face_col = [blue_color] * sum(self.support_) + [yellow_color] * sum(self.support_weak_) for c in range(len(box_face_col)): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_color(box_face_col[c]) + + xrange = vimp_df.max(skipna=True).max(skipna=True) - vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True) - 0.10 * xrange) - xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) - custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), - Line2D([0], [0], color="#FDD023", lw=5), + custom_lines = [Line2D([0], [0], color=blue_color, lw=5), + Line2D([0], [0], color=yellow_color, lw=5), Line2D([0], [0], color="gray", lw=5), Line2D([0], [0], linestyle='--', color="gray", lw=2)] bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() plt.title('Boruta importance and selected predictors') - fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) - plt.tight_layout() + fig.set_size_inches((10, 1.5 * np.rint(max(vimp_df.shape) / 10))) + # plt.tight_layout() plt.show() - def _is_tree_based(self): - """ - checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) - :return: - """ - tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] - condition = any(i in str(type(self.estimator)) for i in tree_based_models) - return condition - - def _validate_pandas_input(self, arg): + @staticmethod + def _validate_pandas_input(arg): try: return arg.values except AttributeError: @@ -307,7 +349,45 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." ) - def _fit(self, X_raw, y): + def _fit(self, X_raw, y, sample_weight=None): + """ + Private method. 
+ Chaining: + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters + ---------- + X_raw : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + self : object + Nothing but attributes + """ + # self.is_cat = is_catboost(self.estimator) start_time = time.time() # basic cat features encoding @@ -323,7 +403,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self._is_catboost: + if self.is_cat is False: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -347,6 +427,10 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) + if sample_weight is not None: + if not isinstance(sample_weight, np.ndarray): + sample_weight = self._validate_pandas_input(sample_weight) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape @@ -378,16 +462,14 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self._is_catboost: - if self._is_lightgbm: - # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + if self.is_cat is False: + if self.is_lgb: self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) else: self.estimator.set_params(random_state=self.random_state) - # add shadow attributes, shuffle them and train estimator, get imps - cur_imp = self._add_shadows_get_imps(X, y, dec_reg) + cur_imp = self._add_shadows_get_imps(X, y, sample_weight, dec_reg) # get the threshold of shadow importances we will use for rejection imp_sha_max = np.percentile(cur_imp[1], self.perc) @@ -428,10 +510,23 @@ def _fit(self, X_raw, y): # for plotting self.imp_real_hist = imp_history self.sha_max = imp_sha_max + + if isinstance(X_raw, np.ndarray): + X_raw = pd.DataFrame(X_raw) + if isinstance(X_raw, pd.DataFrame): self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] + self.tag_df = pd.DataFrame({'predictor': list(X_raw.columns)}) + self.tag_df['Boruta'] = 1 + self.tag_df['Boruta'] = np.where(self.tag_df['predictor'].isin(list(self.support_names_)), 1, 0) - # ranking, confirmed variables are rank 1 + if isinstance(X_raw, pd.DataFrame): + self.support_weak_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_weak_) if x] + self.tag_df['Boruta_weak_incl'] = np.where(self.tag_df['predictor'].isin( + list(self.support_names_ + self.support_weak_names_) + ), 1, 0) + + # ranking, 
confirmed variables are rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) # tentative variables are rank 2 self.ranking_[tentative] = 2 @@ -470,6 +565,20 @@ def _fit(self, X_raw, y): return self def _transform(self, X, weak=False, return_df=False): + """ + Private method + + transform the predictor matrix by dropping the rejected and (optional) the undecided predictors + :param X: pd.DataFrame + predictor matrix + :param weak: bool + whether to drop or not the undecided predictors + :param return_df: bool + return a pandas dataframe or not + :return: + X: np.array or pd.DataFrame + the transformed predictors matrix + """ # sanity check try: self.ranking_ @@ -489,7 +598,7 @@ def _transform(self, X, weak=False, return_df=False): def _get_tree_num(self, n_feat): depth = self.estimator.get_params()['max_depth'] - if depth == None: + if depth is None: depth = 10 # how many times a feature should be considered on average f_repr = 100 @@ -498,17 +607,36 @@ def _get_tree_num(self, n_feat): n_estimators = int(multi * f_repr) return n_estimators - def _get_imp(self, X, y): + def _get_imp(self, X, y, sample_weight=None): + """ + Get the native feature importance (impurity based for instance) + This is know to return biased and uninformative results. + e.g. + https://scikit-learn.org/stable/auto_examples/inspection/ + plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + + or + https://explained.ai/rf-importance/ + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + """ try: - if self._is_catboost: + if self.is_cat: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: X[obj_feat] = X[obj_feat].astype('str').astype('category') cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() - self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + self.estimator.fit(X, y, sample_weight=sample_weight, cat_features=cat_feat) else: - self.estimator.fit(X, y, sample_weight=self.weight) + self.estimator.fit(X, y, sample_weight=sample_weight) except Exception as e: raise ValueError('Please check your X and y variable. 
The provided ' @@ -520,111 +648,28 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp - def _get_shap_imp(self, X, y): - # SHAP and permutation importances must be computed on unseen data - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. The provided ' - 'estimator cannot be fitted to your data.\n' + str(e)) - # build the explainer - explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") - shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification and lightgbm - if is_classifier(self.estimator): - if isinstance(shap_values, list): - # for lightgbm clf sklearn api, shap returns list of arrays - # https://github.com/slundberg/shap/issues/526 - class_inds = range(len(shap_values)) - shap_imp = np.zeros(shap_values[0].shape[1]) - for i, ind in enumerate(class_inds): - shap_imp += np.abs(shap_values[ind]).mean(0) - shap_imp /= len(shap_values) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - raise ValueError('Not a tree based model') - - return shap_imp - - def _get_perm_imp(self, X, y): - - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. 
The provided '
-                                 'estimator cannot be fitted to your data.\n' + str(e))
-
-            perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1)
-            imp = perm_imp.importances_mean.ravel()
-        else:
-            raise ValueError('Not a tree based model')
-
-        return imp
-
     def _get_shuffle(self, seq):
         self.random_state.shuffle(seq)
         return seq
 
-    def _add_shadows_get_imps(self, X, y, dec_reg):
+    def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
+        """
+        Add a shuffled copy of the columns (shadows) and get the feature importance of the augmented data set
+
+        :param X: pd.DataFrame of shape [n_samples, n_features]
+            predictor matrix
+        :param y: pd.series of shape [n_samples]
+            target
+        :param sample_weight: array-like, shape = [n_samples], default=None
+            Individual weights for each sample
+        :param dec_reg: array
+            holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)
+        :return:
+        imp_real: array
+            feature importance of the real predictors
+        imp_sha: array
+            feature importance of the shadow predictors
+        """
         # find features that are tentative still
         x_cur_ind = np.where(dec_reg >= 0)[0]
         x_cur = np.copy(X[:, x_cur_ind])
@@ -638,9 +683,9 @@ def _add_shadows_get_imps(self, X, y, dec_reg):
             x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
         # get importance of the merged matrix
         if self.importance == 'shap':
-            imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y)
+            imp = _get_shap_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
         elif self.importance == 'pimp':
-            imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y)
+            imp = _get_perm_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
         else:
             imp = self._get_imp(np.hstack((x_cur, x_sha)), y)
 
@@ -652,7 +697,18 @@ def _add_shadows_get_imps(self, X, y, dec_reg):
 
         return imp_real, imp_sha
 
-    def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
+    @staticmethod
+    def _assign_hits(hit_reg, cur_imp, imp_sha_max):
+        """
+        Count how many times a given feature was more important than the best of the shadow features
+        :param hit_reg: array
+            holds the number of hits, i.e. how many times a given feature was more important than the best of the shadow features
+        :param cur_imp: array
+            current importance
+        :param imp_sha_max: float
+            importance of the best shadow predictor
+        :return:
+        """
         # register hits for features that did better than the best of shadows
         cur_imp_no_nan = cur_imp[0]
         cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
@@ -661,6 +717,20 @@ def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
         return hit_reg
 
     def _do_tests(self, dec_reg, hit_reg, _iter):
+        """
+        Perform the test deciding whether the feature should be tagged as relevant (confirmed), not relevant
+        (rejected) or left undecided. The test is a binomial test on the hits accumulated over the attempts
+        performed so far, i.e. it counts how many times a given feature was more important than the best
+        of the shadow features and checks whether the associated probability is below, between or above
+        the rejection or acceptance threshold.
+ + :param dec_reg: array + holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected) + :param hit_reg: array + counts how many times a given feature was more important than the best of the shadow features + :param _iter: + :return: + """ active_features = np.where(dec_reg >= 0)[0] hits = hit_reg[active_features] # get uncorrected p values based on hit_reg @@ -696,7 +766,8 @@ def _do_tests(self, dec_reg, hit_reg, _iter): dec_reg[active_features[to_reject]] = -1 return dec_reg - def _fdrcorrection(self, pvals, alpha=0.05): + @staticmethod + def _fdrcorrection(pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. @@ -734,7 +805,8 @@ def _fdrcorrection(self, pvals, alpha=0.05): reject_[pvals_sortind] = reject return reject_, pvals_corrected_ - def _nanrankdata(self, X, axis=1): + @staticmethod + def _nanrankdata(X, axis=1): """ Replaces bottleneck's nanrankdata with scipy and numpy alternative. """ @@ -744,7 +816,12 @@ def _nanrankdata(self, X, axis=1): def _check_params(self, X, y): """ + Private method Check hyperparameters as well as X and y before proceeding with fit. + :param X: pd.DataFrame + predictor matrix + :param y: pd.series + target """ # check X and y are consistent len, X is Array and y is column X, y = check_X_y(X, y, dtype=None, force_all_finite=False) @@ -755,6 +832,19 @@ def _check_params(self, X, y): raise ValueError('Alpha should be between 0 and 1.') def _print_results(self, dec_reg, _iter, flag): + """ + Private method + printing the result + :param dec_reg: array + if the feature as been tagged as relevant (confirmed), not relevant (rejected) or undecided + :param _iter: int + the iteration number + :param flag: int + is still in the feature selection process or not + :return: + output: str + the output to be printed out + """ n_iter = str(_iter) + ' / ' + str(self.max_iter) n_confirmed = np.where(dec_reg == 1)[0].shape[0] n_rejected = np.where(dec_reg == -1)[0].shape[0] @@ -780,3 +870,125 @@ def _print_results(self, dec_reg, _iter, flag): vimp = 'native' output = "\n\nBorutaPy finished running using " + vimp + " var. 
imp.\n\n" + result print(output) + + +def _split_fit_estimator(estimator, X, y, sample_weight=None): + """ + Private function + split the train, test and fit the model + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + model + fitted model + X_tt: array [n_samples, n_features] + the test split, predictors + y_tt: array [n_samples] + the test split, target + """ + if sample_weight is not None: + w = sample_weight + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if check_if_tree_based(estimator): + try: + if is_catboost(estimator): + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + else: + raise ValueError('Not a tree based model') + + return model, X_tt, y_tt, w_tt + + +def _get_shap_imp(estimator, X, y, sample_weight=None): + """ + Private function + Get the SHAP feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + shap_imp, array + the SHAP importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification and lightgbm + if is_classifier(estimator): + if isinstance(shap_values, list): + # for lightgbm clf sklearn api, shap returns list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp += np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) + + return shap_imp + + +def _get_perm_imp(estimator, X, y, sample_weight): + """ + Private function + Get the permutation feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = 
[n_samples], default=None + Individual weights for each sample + + :return: + imp, array + the permutation importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + return imp diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py index 507db6f..3bc4a5d 100644 --- a/boruta/test/modif_tests.py +++ b/boruta/test/modif_tests.py @@ -92,7 +92,9 @@ def get_titanic_data(): X = pd.DataFrame(X_trans, columns = X.columns) # encode X, cat_var_df, inv_mapper = cat_var(X) - return X, y + # sample weight is just a dummy random vector for testing purpose + sample_weight = np.random.uniform(0,1, len(y)) + return X, y, sample_weight def get_boston_data(): boston = load_boston() @@ -103,23 +105,23 @@ def get_boston_data(): return X, y # Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors -def testing_estimators(varimp, models, X, y): +def testing_estimators(varimp, models, X, y, sample_weight=None): for model in models: - print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) - feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) - feat_selector.fit(X, y) + print('='*20 +' testing: {mod:>55} for var.imp: {vimp:<15} '.format(mod=str(model), vimp=varimp)+'='*20 ) + feat_selector = noglmgroot.BorutaPy(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y, sample_weight) print(feat_selector.support_names_) - feat_selector.plot_importance() + feat_selector.plot_importance(n_feat_per_inch=3) gc.enable() del(feat_selector, model) gc.collect() -def testing_clf_all_varimp(X, y): +def testing_clf_all_varimp(X, y, sample_weight=None): for varimp in ['shap', 'pimp', 'native']: - models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] - testing_estimators(varimp=varimp, models=models, X=X, y=y) + models = [RandomForestClassifier(n_jobs= 4, oob_score= True), catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y, sample_weight=sample_weight) gc.enable() - del(models) + del models gc.collect() def testing_regr_all_varimp(X, y): @@ -129,13 +131,13 @@ def testing_regr_all_varimp(X, y): gc.enable() del(models) gc.collect() - -X, y = get_titanic_data() -X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +print('='*20 + ' Benchmarking using sklearn permutation importance ' + '='*20 ) +X, y, sample_weight = get_titanic_data() +X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, sample_weight, stratify=y, random_state=42) # lightgbm faster and better than RF lgb_model = LGBMClassifier(n_jobs= 4) -lgb_model.fit(X_train, y_train) +lgb_model.fit(X_train, y_train, sample_weight=w_train) result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() # Plot @@ -146,9 +148,10 @@ def testing_regr_all_varimp(X, y): plt.show() if __name__ == '__main__': - # classification and cat. 
pred
-    X, y = get_titanic_data()
-    testing_clf_all_varimp(X=X, y=y)
+    # classification and cat. pred, sample weight is just a dummy random vector for testing purposes
+    X, y, sample_weight = get_titanic_data()
+    testing_clf_all_varimp(X=X, y=y, sample_weight=sample_weight)
+
     # regression
     X, y = get_boston_data()
    testing_regr_all_varimp(X=X, y=y)
\ No newline at end of file
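
As a quick reference for reviewers, the snippet below sketches how the pieces introduced by this patch series fit together: the importance argument selects 'native', 'shap' or 'pimp' importance, sample weights are now passed to fit rather than to the constructor, and the outcome can be inspected through support_names_, tag_df and plot_importance. The dataset, estimator settings and variable names below are illustrative assumptions for the sketch, not part of the patch.

    import numpy as np
    import pandas as pd
    from lightgbm import LGBMClassifier
    from boruta import BorutaPy

    # synthetic data: 5 informative columns and 5 pure-noise columns (illustrative only)
    rng = np.random.RandomState(42)
    X = pd.DataFrame(rng.normal(size=(500, 10)), columns=["x%d" % i for i in range(10)])
    y = (X[["x0", "x1", "x2", "x3", "x4"]].sum(axis=1) > 0).astype(int)
    sw = rng.uniform(0.5, 1.5, size=len(y))          # dummy per-sample weights

    model = LGBMClassifier(max_depth=5, n_jobs=-1)
    feat_selector = BorutaPy(model, n_estimators=100, importance='shap',  # or 'pimp' / 'native'
                             max_iter=50, random_state=42, verbose=1)
    feat_selector.fit(X, y, sample_weight=sw)        # sample_weight handled in fit
    print(feat_selector.support_names_)              # names of the confirmed predictors
    print(feat_selector.tag_df.head())               # per-predictor accepted/rejected summary
    feat_selector.plot_importance(n_feat_per_inch=5) # boxplot with the shadow-max threshold
    X_sel = feat_selector.transform(X, return_df=True)

With a tree-based estimator such as the LightGBM classifier above, the SHAP and permutation importance paths fit on a train split and score on a held-out split, which is what makes them less biased than the builtin impurity importance for random numerical or high-cardinality predictors.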