Feature Selection
Feature selection is the process of systematically selecting the best subset of input features for model training to predict the target variable. The main families of methods are:
- Embedded methods: built into the model training phase of a classification or regression model (see the sketch after this list)
- Filter methods: check the intrinsic properties of features and use statistical techniques to evaluate the relationship between a predictor and the target variable
- Wrapper methods: utilize ML algorithms as part of the feature evaluation process to identify and select the best subset of features iteratively and according to a specific performance metric
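The rest of this section walks through filter and wrapper methods in detail; embedded methods are not revisited, so here is a minimal sketch of one. It assumes a generic 'abc.csv' dataset with numeric features and a classification target in the last column (the same layout used in the later examples), and uses scikit-learn's SelectFromModel with a random forest, whose built-in feature importances drive the selection; the 'mean importance' threshold is an illustrative choice.
# minimal sketch of an embedded method: selection driven by a model's own feature importances
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# assumed layout: features in all but the last column, classification target in the last column
data = pd.read_csv('abc.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# fit a random forest and keep only the features whose importance exceeds the mean importance
selector = SelectFromModel(estimator=RandomForestClassifier(random_state=1), threshold='mean')
selector.fit(X, y)
X_selected = X.loc[:, selector.get_support()]
print('Selected features:', list(X_selected.columns))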
Filter Feature Selection
There are currently no widely used and well-supported Python packages that implement advanced multivariate filter-based feature selection techniques, so the univariate methods below are what is used in practice.
Categorical Feature Selection
scikit-learn’s SelectKBest class is used in conjunction with either the chi2 or mutual_info_classif scoring functions to identify and select the top k most relevant features (a SelectKBest sketch follows the chi-squared example below). The underlying statistics are:
- Chi-Squared Test of Independence
- Mutual Information
# import all the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.stats import chi2_contingency
# load the dataset
dataset = pd.read_csv('abc.csv')
# split into input and output variables (assuming no missing values in the dataset)
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# define an empty dictionary to store the chi-squared test results
chi2_check = {}
# loop over each column in the training set to calculate the chi statistic against the target variable
for column in X_train:
    chi, p, dof, ex = chi2_contingency(pd.crosstab(y_train, X_train[column]))
    chi2_check.setdefault('Feature', []).append(column)
    chi2_check.setdefault('p-value', []).append(p)
# convert the dictionary to a DataFrame
chi2_result = pd.DataFrame(data=chi2_check)
# select the top 4 features based on the lowest p-values
top4_chi2 = chi2_result.nsmallest(4, 'p-value')['Feature'].tolist()
# filter the shortlisted features into new DataFrames
X_train_fs = X_train[top4_chi2]
X_test_fs = X_test[top4_chi2]
# convert the shortlisted feature DataFrames into dummy variables
X_train_enc = pd.get_dummies(X_train_fs)
X_test_enc = pd.get_dummies(X_test_fs)
# reindex the dummied test set so that all feature columns in the train set are also present in the test set
X_test_enc = X_test_enc.reindex(labels=X_train_enc.columns, axis=1, fill_value=0)
# instantiate the LabelEncoder class to transform the target variable
le = LabelEncoder()
# fit the LabelEncoder on the training target
le.fit(y_train)
# transform the training and test target variables
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)
# define the classification algorithm
model = LogisticRegression()
# fit the model
model.fit(X_train_enc, y_train_enc)
# predict on the test set
yhat = model.predict(X_test_enc)
# evaluate predictions
accuracy = accuracy_score(y_test_enc, yhat)
print('Accuracy: %.2f' % (accuracy * 100))
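As noted above, the same shortlisting can also be done with SelectKBest instead of looping over chi2_contingency manually. A minimal sketch, reusing the variables from the block above and assuming the categorical features are one-hot encoded first (scikit-learn's chi2 scorer expects non-negative numeric inputs); swap in mutual_info_classif to rank by mutual information instead:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
# one-hot encode all categorical features first; chi2 expects non-negative numeric inputs
X_train_ohe = pd.get_dummies(X_train).astype(int)
X_test_ohe = pd.get_dummies(X_test).reindex(labels=X_train_ohe.columns, axis=1, fill_value=0).astype(int)
# score the encoded features with the chi-squared statistic and keep the k best (use mutual_info_classif for MI-based selection)
fs = SelectKBest(score_func=chi2, k=4)
fs.fit(X_train_ohe, y_train_enc)
print('Selected encoded features:', list(X_train_ohe.columns[fs.get_support()]))
# reduce both sets to the selected features
X_train_sel = fs.transform(X_train_ohe)
X_test_sel = fs.transform(X_test_ohe)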
Numerical Feature Selection — Classification Problem
- ANOVA F-Statistic
- Mutual Information (as explained above; see the variant sketch after the code below)
# import the required libraries
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# load the dataset
data = pd.read_csv('abc.csv')
# split into input (X) and target (y) variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# define the cross-validation evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the pipeline to evaluate
model = LogisticRegression(max_iter=1000)
fs = SelectKBest(score_func=f_classif)
pipeline = Pipeline(steps=[('anova', fs), ('lr', model)])
# define the grid, testing up to the max no. of features as the `k` value for `SelectKBest`
grid = {}
grid['anova__k'] = [i + 1 for i in range(X.shape[1])]
# define the grid search
search = GridSearchCV(pipeline, grid, scoring='accuracy', n_jobs=-1, cv=cv)
# perform the search
results = search.fit(X, y)
# summarize the best score and configuration
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
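To rank features by mutual information (the second option listed above) instead of the ANOVA F-statistic, only the score_func needs to change; the pipeline step is still named 'anova' here purely so the existing grid['anova__k'] entry keeps working. A minimal sketch reusing the objects defined above:
from sklearn.feature_selection import mutual_info_classif
# same pipeline and grid search, but scoring features by mutual information with the target
fs = SelectKBest(score_func=mutual_info_classif)
pipeline = Pipeline(steps=[('anova', fs), ('lr', model)])
search = GridSearchCV(pipeline, grid, scoring='accuracy', n_jobs=-1, cv=cv)
results = search.fit(X, y)
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)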
Numerical Feature Selection — Regression Problem
- Correlation statistics
- Mutual Information (same as explained earlier, with just a different score_func to be used: mutual_info_regression)
# import the required libraries
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression  # or mutual_info_regression, if desired
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# load the dataset
data = pd.read_csv('abc.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# define the evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the pipeline to evaluate
model = LinearRegression()
fs = SelectKBest(score_func=f_regression)  # can also use mutual_info_regression to select based on Mutual Information
pipeline = Pipeline(steps=[('feature', fs), ('lr', model)])
# define the grid
grid = {}
grid['feature__k'] = [i for i in range(X.shape[1] - 20, X.shape[1] + 1)]  # or replace with any desired range of `k` values to be tested
# define the grid search
search = GridSearchCV(pipeline, grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# perform the search
results = search.fit(X, y)
# summarize the best score and configuration
print('Best MAE: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
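Once the grid search has identified the best value of k, it can be useful to see which columns that configuration actually keeps. A minimal sketch, reusing the fitted search from above (GridSearchCV refits the best pipeline on all the data by default, so best_estimator_ is available):
# inspect the features kept by the best configuration found by the grid search
best_fs = results.best_estimator_.named_steps['feature']
selected_columns = X.columns[best_fs.get_support()]
print('Features kept by the best configuration:', list(selected_columns))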
Wrapper Feature Selection
Recursive Feature Elimination (RFE) from scikit-learn is the most widely used wrapper feature selection method in practice
1) manual selection of the number of features
# import all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
# load dataset
data = pd.read_csv('abc.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# create a pipeline of a specific algorithm with different no. of features to be evaluated
models = {}
for i in range(2, 10):  # loop over a number of features to be used in RFE
    FS = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
    DTC = DecisionTreeClassifier()
    models[str(i)] = Pipeline(steps=[('features', FS), ('DT model', DTC)])
# evaluate all the models using CV
results = []
names = []
for name, model in models.items():
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s: %.3f' % (name, np.mean(scores)))
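Beyond the cross-validated scores, it can help to see which columns RFE actually keeps for a given setting. A minimal sketch that fits RFE once on all the data, with 5 features as a purely illustrative choice:
# fit RFE once on all the data to inspect the selected features (5 is an illustrative choice)
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
rfe.fit(X, y)
for col, kept, rank in zip(X.columns, rfe.support_, rfe.ranking_):
    print('%s: selected=%s, rank=%d' % (col, kept, rank))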
2) automatic feature selection
# import all the required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import Pipeline
# load dataset
data = pd.read_csv('abc.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# create pipelines for different base algorithms to be used in RFECV (the no. of features will be auto-selected based on cv in RFECV)
models = {}
# logistic regression
rfecv = RFECV(estimator = LogisticRegression(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['LR'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# decision tree
rfecv = RFECV(estimator = DecisionTreeClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['DT'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# random forest
rfecv = RFECV(estimator = RandomForestClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['RF'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# XGBoost Classifier
rfecv = RFECV(estimator=XGBClassifier(), cv = 10, scoring = 'accuracy')
model = DecisionTreeClassifier()
models['XGB'] = Pipeline(steps = [('features', rfecv), ('model', model)])
# evaluate all the models
results = []
names = []
for name, model in models.items():
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s: %.3f' % (name, np.mean(scores)))
# create the final pipeline (here using the LogisticRegression-based RFECV as an example)
rfecv = RFECV(estimator=LogisticRegression(), cv=10, scoring='accuracy')
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('features', rfecv), ('model', model)])
# fit the model on all available data
pipeline.fit(X, y)
# make a prediction for one example
data = ...  # load or define any new, unseen data that you want to make predictions on
yhat = pipeline.predict(data)
print('Predicted: %.3f' % yhat[0])
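If needed, the fitted pipeline also exposes how many (and which) features RFECV ended up keeping, through the standard RFECV attributes n_features_ and support_:
# inspect the features automatically selected by RFECV inside the fitted pipeline
selector = pipeline.named_steps['features']
print('Number of features selected: %d' % selector.n_features_)
print('Selected features:', list(X.columns[selector.support_]))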