import numpy as np
import pandas as pd
import re
import sys

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import FeatureUnion, make_union, Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import SelectFpr, SelectFdr, SelectFwe
from sklearn.model_selection import cross_val_score
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import gc
print('The number of selected features:', len(selected_features))
print(f'Dropped {init_n - len(selected_features)} uninformative features.')
Starting time 20:38:51
calc_mi_scores cost 0 hours 17 minutes 49 seconds
The number of selected features: 2050
Dropped 117 uninformative features.
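For reference, the calc_mi_scores helper was defined earlier in the notebook. A minimal sketch of the idea, using scikit-learn's mutual_info_classif (this assumes X is numerically encoded with no missing values, and is not the notebook's exact implementation):

from sklearn.feature_selection import mutual_info_classif

def calc_mi_scores(X, y, discrete_features='auto'):
    # Hypothetical sketch: estimate mutual information between each feature
    # and the target; features with a score of 0 carry no standalone signal.
    mi = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=0)
    return pd.Series(mi, index=X.columns, name='mi_scores').sort_values(ascending=False)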
selected_categorical_features = [col for col in categorical_features if col in selected_features]
eval_results, feature_importances = score_dataset(X[selected_features], y, selected_categorical_features)
Starting time 20:56:46
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.778311 + 0.00276223
[100] cv_agg's valid auc: 0.783085 + 0.00266899
[150] cv_agg's valid auc: 0.783015 + 0.00280856
Early stopping, best iteration is:
[122] cv_agg's valid auc: 0.783271 + 0.00267406
score_dataset cost 0 hours 6 minutes 7 seconds
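The score_dataset helper is likewise defined earlier in the notebook. Judging from the logs, it runs LightGBM cross-validation with early stopping and returns the evaluation history together with feature importances; a minimal sketch under those assumptions (the fold count, parameters, and importance type are all guesses, not the notebook's exact code):

def score_dataset(X, y, categorical_features):
    # Hypothetical sketch: LightGBM CV with AUC metric and early stopping.
    train_set = lgb.Dataset(X, y, categorical_feature=categorical_features,
                            free_raw_data=False)
    params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1}
    print('Starting training...')
    eval_results = lgb.cv(
        params, train_set, num_boost_round=1000, nfold=5,
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)],
        return_cvbooster=True,
    )
    # Average the gain-based importances over the CV boosters.
    boosters = eval_results['cvbooster'].boosters
    scores = np.mean([b.feature_importance('gain') for b in boosters], axis=0)
    feature_importances = pd.DataFrame({'score': scores}, index=X.columns)
    return eval_results, feature_importances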
@timer
def drop_correlated_features(X, y, threshold=0.9):
    to_keep = []
    to_drop = []
    # Categorical features are skipped by the correlation check
    categorical = X.select_dtypes(exclude='number').columns.tolist()
    for i, col in enumerate(X.columns):
        if col in categorical:
            continue
        # Correlations with the features kept so far
        corr = X[to_keep].corrwith(X[col]).abs()
        # Drop the column if any correlation exceeds the threshold
        if any(corr > threshold):
            to_drop.append(col)
        else:
            to_keep.append(col)
        progress((i + 1) / len(X.columns))
    print("\nThe number of correlated features:", len(to_drop))
    return to_keep
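The @timer decorator and the progress helper above also come from earlier in the notebook; hypothetical stand-ins that reproduce the log format seen in the outputs:

import time
import functools

def timer(func):
    # Hypothetical stand-in for the notebook's @timer decorator.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print('Starting time', time.strftime('%H:%M:%S'))
        start = time.time()
        result = func(*args, **kwargs)
        m, s = divmod(int(time.time() - start), 60)
        h, m = divmod(m, 60)
        print(f'{func.__name__} cost {h} hours {m} minutes {s} seconds')
        return result
    return wrapper

def progress(frac):
    # Hypothetical stand-in for the notebook's progress bar.
    bar = '#' * int(50 * frac)
    sys.stdout.write(f'\rProcessing: [{bar:<50}]{frac:.1%}')
    sys.stdout.flush()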
original_features = [f for f in X.columns if f in original_df.columns]
derived_features = [f for f in X.columns if f not in original_df.columns]
selected_features = [col for col in original_features + derived_features if col in selected_features]
# Drop features that are correlated
init_n = len(selected_features)
selected_features = drop_correlated_features(X[selected_features], y, threshold=0.9)

print('The number of selected features:', len(selected_features))
print(f'Dropped {init_n - len(selected_features)} correlated features.')
Starting time 21:03:05
Processing: [##################################################]100.0%
The number of correlated features: 1110
drop_correlated_features cost 0 hours 33 minutes 5 seconds
The number of selected features: 940
Dropped 1110 correlated features.
In day-to-day work, we often use the feature_engine package for this:
# Drop features that are correlated
# from feature_engine.selection import DropCorrelatedFeatures

# print('The number of selected features:', len(selected_features))
# print(f'Dropped {init_n - len(selected_features)} features.')
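A minimal sketch of the feature_engine route, kept commented out like the original; the variables argument and threshold below are assumptions chosen to mirror the manual implementation above:

# numeric_cols = X[selected_features].select_dtypes('number').columns.tolist()
# dcf = DropCorrelatedFeatures(variables=numeric_cols, method='pearson', threshold=0.9)
# X_reduced = dcf.fit_transform(X[selected_features])
# print('The number of correlated features:', len(dcf.features_to_drop_))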
selected_categorical_features = [col for col in categorical_features if col in selected_features]
eval_results, feature_importances = score_dataset(X[selected_features], y, selected_categorical_features)
Starting time 21:36:12
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.776068 + 0.00333724
[100] cv_agg's valid auc: 0.781097 + 0.00296053
[150] cv_agg's valid auc: 0.781236 + 0.00298245
Early stopping, best iteration is:
[136] cv_agg's valid auc: 0.781375 + 0.00302538
score_dataset cost 0 hours 2 minutes 23 seconds
ANOVA
ANOVA (analysis of variance) is mainly used to measure the relevance of continuous features in classification problems: the F-test compares between-class variance to within-class variance, so a small p-value indicates the feature's mean differs across the classes.
from sklearn.feature_selection import f_classif
numeric_features = [col for col in X.columns if col not in categorical_features]
f_statistic, p_values = f_classif(X[numeric_features], y)
anova = pd.DataFrame(
    {"f_statistic": f_statistic, "p_values": p_values},
    index=numeric_features
)
print("The number of irrelevant features for classification:",
      anova['p_values'].ge(0.05).sum())
The number of irrelevant features for classification: 274
Chi-square test
The chi-square test is a statistical method for measuring the association between two categorical variables. Note that scikit-learn's chi2 requires non-negative feature values, so categorical features must already be encoded as non-negative codes.
from sklearn.feature_selection import chi2
chi2_stats, p_values = chi2(X[categorical_features], y)
chi2_test = pd.DataFrame(
    {"chi2_stats": chi2_stats, "p_values": p_values},
    index=categorical_features
)
print("The number of irrelevant features for classification:",
      chi2_test['p_values'].ge(0.05).sum())
The number of irrelevant features for classification: 9
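The feature_selection transformer applied below was built earlier in the notebook. Based on the log output (two SelectFdr branches, one per column type), a plausible reconstruction is the following; the alpha value and exact layout are assumptions:

# Hypothetical reconstruction: FDR-controlled selection per column type.
feature_selection = make_column_transformer(
    (SelectFdr(f_classif, alpha=0.05), numeric_features),
    (SelectFdr(chi2, alpha=0.05), categorical_features),
    verbose=True,
    verbose_feature_names_out=False,
)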
selected_features_by_fdr = feature_selection.fit(X, y).get_feature_names_out()
print("The number of selected features:", len(selected_features_by_fdr))
print("Dropped {} features.".format(X.shape[1] - len(selected_features_by_fdr)))
[ColumnTransformer] ... (1 of 2) Processing selectfdr-1, total= 2.7min
[ColumnTransformer] ... (2 of 2) Processing selectfdr-2, total= 0.1s
The number of selected features: 1838
Dropped 329 features.
selected_categorical_features_by_fdr = [col for col in categorical_features if col in selected_features_by_fdr]
eval_results, feature_importances = score_dataset(X[selected_features_by_fdr], y, selected_categorical_features_by_fdr)
Starting time 21:44:08
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.777829 + 0.00296151
[100] cv_agg's valid auc: 0.782637 + 0.00263458
[150] cv_agg's valid auc: 0.782612 + 0.0023263
Early stopping, best iteration is:
[129] cv_agg's valid auc: 0.782834 + 0.00242003
score_dataset cost 0 hours 5 minutes 41 seconds
def calc_gini_scores(X, y, bins=10):
    X = pd.DataFrame(X)
    y = pd.Series(y)
    gini_scores = pd.Series(dtype=float)
    # Encode non-numeric columns as integer category codes
    colnames = X.select_dtypes(exclude='number').columns
    X[colnames] = X[colnames].astype("category").apply(lambda x: x.cat.codes)
    # Treat low-cardinality columns as discrete
    discrete = [col for col in X if X[col].nunique() <= 50]
    # Compute the gini score of each feature
    for colname in X.columns:
        if colname in discrete:
            var = X[colname]
        else:
            var = pd.qcut(X[colname], bins, duplicates="drop")
        p = y.groupby(var).mean()
        gini = 1 - p.pow(2).sum()
        gini_scores[colname] = gini
    return gini_scores.sort_values(ascending=False)
gini_scores = calc_gini_scores(X, y)
print(f"There are {gini_scores.le(0.02).sum()} features with a gini score <= 0.02.")
# from sklearn.svm import LinearSVC
# from sklearn.feature_selection import RFECV

# Use a linear SVM with L1 penalty as the model
# svc = LinearSVC(dual=False, penalty="l1")

# Recursive feature elimination with cross-validation to select features
# rfe = RFECV(svc, step=1, cv=5, verbose=1)
# rfe.fit(X, y)

# The mask of selected features
# print(list(zip(X.columns, rfe.support_)))
# print("The number of features:", rfe.n_features_in_)
# print("The number of selected features:", rfe.n_features_)

# feature_rank = pd.Series(rfe.ranking_, index=X.columns).sort_values(ascending=False)
# print("Features sorted by their rank:", feature_rank[:10], sep="\n")
selected_categorical_features = [col for col in categorical_features if col in selected_features]
eval_results, feature_importances = score_dataset(X[selected_features], y, selected_categorical_features)
Starting time 21:55:44
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.776068 + 0.00333724
[100] cv_agg's valid auc: 0.781097 + 0.00296053
[150] cv_agg's valid auc: 0.781236 + 0.00298245
Early stopping, best iteration is:
[136] cv_agg's valid auc: 0.781375 + 0.00302538
score_dataset cost 0 hours 2 minutes 25 seconds
# Sort features according to importance
feature_importances = feature_importances.sort_values('score', ascending=False)
feature_importances['score'].head(15)
# Find the features with zero importance
zero_features = feature_importances.query("score == 0.0").index.tolist()
print(f'\nThere are {len(zero_features)} features with 0.0 importance')
There are 105 features with 0.0 importance
selected_features = [col for col in selected_features if col not in zero_features]
print("The number of selected features:", len(selected_features))
print("Dropped {} features with zero importance.".format(len(zero_features)))
The number of selected features: 835
Dropped 105 features with zero importance.
selected_categorical_features = [col for col in categorical_features if col in selected_features]
eval_results, feature_importances = score_dataset(X[selected_features], y, selected_categorical_features)
Starting time 21:58:13
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.77607 + 0.00333823
[100] cv_agg's valid auc: 0.781042 + 0.00295406
[150] cv_agg's valid auc: 0.781317 + 0.00303434
[200] cv_agg's valid auc: 0.780819 + 0.00281177
Early stopping, best iteration is:
[154] cv_agg's valid auc: 0.781405 + 0.0029417
score_dataset cost 0 hours 2 minutes 34 seconds
def select_import_features(scores, thresh=0.95):
    feature_imp = pd.DataFrame(scores, columns=['score'])
    # Sort features according to importance
    feature_imp = feature_imp.sort_values('score', ascending=False)
    # Normalize the feature importances
    feature_imp['score_normalized'] = feature_imp['score'] / feature_imp['score'].sum()
    feature_imp['cumsum'] = feature_imp['score_normalized'].cumsum()
    # Keep the top features whose cumulative importance stays within the threshold
    selected_features = feature_imp.query(f'cumsum <= {thresh}')
    return selected_features.index.tolist()
init_n = len(selected_features)
import_features = select_import_features(feature_importances['score'], thresh=0.95)
print("The number of import features:", len(import_features))
print(f'Dropped {init_n - len(import_features)} features.')
The number of import features: 241
Dropped 594 features.
The remaining 241 features are enough to cover 95% of the total importance.
import_categorical_features = [col for col in categorical_features if col in import_features]
eval_results, feature_importances = score_dataset(X[import_features], y, import_categorical_features)
Starting time 22:00:49
Starting training...
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
Training until validation scores don't improve for 50 rounds
[50] cv_agg's valid auc: 0.756425 + 0.0029265
[100] cv_agg's valid auc: 0.759284 + 0.0029921
[150] cv_agg's valid auc: 0.759162 + 0.00314089
Early stopping, best iteration is:
[115] cv_agg's valid auc: 0.759352 + 0.00300464
score_dataset cost 0 hours 0 minutes 21 seconds
Before moving on, we should record the feature selection steps we have taken for future reference:
Remove uninformative features with zero mutual information: dropped 117 features
Remove collinear variables with a correlation coefficient above 0.9: dropped 1110 features
Remove features with 0.0 importance according to the GBM: dropped 105 features
(Optional) Keep only the features needed to reach 95% of the feature importance: dropped 594 features
Let's look at the composition of the selected features:
original = set(original_features) & set(import_features)
derived = set(import_features) - set(original)
print(f"Selected features: {len(original)} original features, {len(derived)} derived features.")
Selected features: 33 original features, 208 derived features.